diff --git a/README.md b/README.md index 9995b6b184..f1f3a25550 100644 --- a/README.md +++ b/README.md @@ -287,8 +287,8 @@ pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf sentence_0 = "This research was consistent with his findings." sentence_1 = "His findings were compatible with this research." sentence_2 = "His findings were not compatible with this research." -inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt') -inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt') +inputs_1 = tokenizer(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt') +inputs_2 = tokenizer(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt') pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item() pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item() diff --git a/docs/README.md b/docs/README.md index 1cfd8e01e4..6da2f78f3a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -167,7 +167,7 @@ Here's an example showcasing everything so far: Indices can be obtained using :class:`transformers.AlbertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ ``` diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst index 7a81c93624..ee12da1847 100644 --- a/docs/source/main_classes/tokenizer.rst +++ b/docs/source/main_classes/tokenizer.rst @@ -11,7 +11,7 @@ The base classes ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` impleme - adding new tokens to the vocabulary in a way that is independant of the underlying structure (BPE, SentencePiece...), - managing special tokens like mask, beginning-of-sentence, etc tokens (adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization) -``BatchEncoding`` holds the output of the tokenizer's encoding methods (``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python tokenizer, this class behave just like a standard python dictionary and hold the various model inputs computed by these methodes (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e. backed by HuggingFace tokenizers library), this class provides in addition several advanced alignement methods which can be used to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token). +``BatchEncoding`` holds the output of the tokenizer's encoding methods (``__call__``, ``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by these methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e. 
backed by HuggingFace tokenizers library), this class provides in addition several advanced alignment methods which can be used to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token). ``PreTrainedTokenizer`` ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/task_summary.rst b/docs/source/task_summary.rst index d1157ccccb..0a425b52cf 100644 --- a/docs/source/task_summary.rst +++ b/docs/source/task_summary.rst @@ -74,7 +74,7 @@ of each other. The process is the following: with the weights stored in the checkpoint. - Build a sequence from the two sentences, with the correct model-specific separators token type ids and attention masks (:func:`~transformers.PreTrainedTokenizer.encode` and - :func:`~transformers.PreTrainedTokenizer.encode_plus` take care of this) + :func:`~transformers.PreTrainedTokenizer.__call__` take care of this) - Pass this sequence through the model so that it is classified in one of the two available classes: 0 (not a paraphrase) and 1 (is a paraphrase) - Compute the softmax of the result to get probabilities over the classes @@ -95,8 +95,8 @@ of each other. The process is the following: >>> sequence_1 = "Apples are especially bad for your health" >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan" - >>> paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt") - >>> not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt") + >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt") + >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt") >>> paraphrase_classification_logits = model(**paraphrase)[0] >>> not_paraphrase_classification_logits = model(**not_paraphrase)[0] @@ -128,8 +128,8 @@ of each other. 
The process is the following: >>> sequence_1 = "Apples are especially bad for your health" >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan" - >>> paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="tf") - >>> not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="tf") + >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf") + >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf") >>> paraphrase_classification_logits = model(paraphrase)[0] >>> not_paraphrase_classification_logits = model(not_paraphrase)[0] @@ -221,7 +221,7 @@ Here is an example of question answering using a model and a tokenizer. The proc ... ] >>> for question in questions: - ... inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt") + ... inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt") ... input_ids = inputs["input_ids"].tolist()[0] ... ... text_tokens = tokenizer.convert_ids_to_tokens(input_ids) @@ -263,7 +263,7 @@ Here is an example of question answering using a model and a tokenizer. The proc ... ] >>> for question in questions: - ... inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="tf") + ... inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf") ... input_ids = inputs["input_ids"].numpy()[0] ... ... text_tokens = tokenizer.convert_ids_to_tokens(input_ids) diff --git a/docs/source/training.rst b/docs/source/training.rst index 5d465f8c37..c497fb4b60 100644 --- a/docs/source/training.rst +++ b/docs/source/training.rst @@ -77,7 +77,7 @@ other than bias and layer normalization terms: optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5) Now we can set up a simple dummy training batch using -:func:`~transformers.PreTrainedTokenizer.batch_encode_plus`. This returns a +:func:`~transformers.PreTrainedTokenizer.__call__`. 
This returns a :func:`~transformers.BatchEncoding` instance which prepares everything we might need to pass to the model. diff --git a/examples/adversarial/utils_hans.py b/examples/adversarial/utils_hans.py index 5058e8b45f..8f230fad98 100644 --- a/examples/adversarial/utils_hans.py +++ b/examples/adversarial/utils_hans.py @@ -298,12 +298,13 @@ def hans_convert_examples_to_features( if ex_index % 10000 == 0: logger.info("Writing example %d" % (ex_index)) - inputs = tokenizer.encode_plus( + inputs = tokenizer( example.text_a, example.text_b, add_special_tokens=True, max_length=max_length, - pad_to_max_length=True, + padding="max_length", + truncation=True, return_overflowing_tokens=True, ) diff --git a/examples/longform-qa/eli5_utils.py b/examples/longform-qa/eli5_utils.py index 4f7d7a9d46..f6e417a8dd 100644 --- a/examples/longform-qa/eli5_utils.py +++ b/examples/longform-qa/eli5_utils.py @@ -193,12 +193,12 @@ def make_qa_retriever_model(model_name="google/bert_uncased_L-8_H-512_A-8", from def make_qa_retriever_batch(qa_list, tokenizer, max_len=64, device="cuda:0"): q_ls = [q for q, a in qa_list] a_ls = [a for q, a in qa_list] - q_toks = tokenizer.batch_encode_plus(q_ls, max_length=max_len, pad_to_max_length=True) + q_toks = tokenizer(q_ls, max_length=max_len, padding="max_length", truncation=True) q_ids, q_mask = ( torch.LongTensor(q_toks["input_ids"]).to(device), torch.LongTensor(q_toks["attention_mask"]).to(device), ) - a_toks = tokenizer.batch_encode_plus(a_ls, max_length=max_len, pad_to_max_length=True) + a_toks = tokenizer(a_ls, max_length=max_len, padding="max_length", truncation=True) a_ids, a_mask = ( torch.LongTensor(a_toks["input_ids"]).to(device), torch.LongTensor(a_toks["attention_mask"]).to(device), @@ -375,12 +375,12 @@ def make_qa_s2s_model(model_name="facebook/bart-large", from_file=None, device=" def make_qa_s2s_batch(qa_list, tokenizer, max_len=64, max_a_len=360, device="cuda:0"): q_ls = [q for q, a in qa_list] a_ls = [a for q, a in qa_list] - 
q_toks = tokenizer.batch_encode_plus(q_ls, max_length=max_len, pad_to_max_length=True) + q_toks = tokenizer(q_ls, max_length=max_len, padding="max_length", truncation=True) q_ids, q_mask = ( torch.LongTensor(q_toks["input_ids"]).to(device), torch.LongTensor(q_toks["attention_mask"]).to(device), ) - a_toks = tokenizer.batch_encode_plus(a_ls, max_length=min(max_len, max_a_len), pad_to_max_length=True) + a_toks = tokenizer(a_ls, max_length=min(max_len, max_a_len), padding="max_length", truncation=True) a_ids, a_mask = ( torch.LongTensor(a_toks["input_ids"]).to(device), torch.LongTensor(a_toks["attention_mask"]).to(device), @@ -531,7 +531,7 @@ def qa_s2s_generate( # ELI5-trained retrieval model usage ############### def embed_passages_for_retrieval(passages, tokenizer, qa_embedder, max_length=128, device="cuda:0"): - a_toks = tokenizer.batch_encode_plus(passages, max_length=max_length, pad_to_max_length=True) + a_toks = tokenizer(passages, max_length=max_length, padding="max_length", truncation=True) a_ids, a_mask = ( torch.LongTensor(a_toks["input_ids"]).to(device), torch.LongTensor(a_toks["attention_mask"]).to(device), @@ -542,7 +542,7 @@ def embed_passages_for_retrieval(passages, tokenizer, qa_embedder, max_length=12 def embed_questions_for_retrieval(q_ls, tokenizer, qa_embedder, device="cuda:0"): - q_toks = tokenizer.batch_encode_plus(q_ls, max_length=128, pad_to_max_length=True) + q_toks = tokenizer(q_ls, max_length=128, padding="max_length", truncation=True) q_ids, q_mask = ( torch.LongTensor(q_toks["input_ids"]).to(device), torch.LongTensor(q_toks["attention_mask"]).to(device), diff --git a/examples/movement-pruning/emmental/modeling_bert_masked.py b/examples/movement-pruning/emmental/modeling_bert_masked.py index a87718fa3a..294db863ee 100644 --- a/examples/movement-pruning/emmental/modeling_bert_masked.py +++ b/examples/movement-pruning/emmental/modeling_bert_masked.py @@ -424,7 +424,7 @@ MASKED_BERT_INPUTS_DOCSTRING = r""" Indices can be obtained using 
:class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/examples/multiple-choice/utils_multiple_choice.py b/examples/multiple-choice/utils_multiple_choice.py index 1b18c6ed63..2f6dd040dc 100644 --- a/examples/multiple-choice/utils_multiple_choice.py +++ b/examples/multiple-choice/utils_multiple_choice.py @@ -510,12 +510,13 @@ def convert_examples_to_features( else: text_b = example.question + " " + ending - inputs = tokenizer.encode_plus( + inputs = tokenizer( text_a, text_b, add_special_tokens=True, max_length=max_length, - pad_to_max_length=True, + padding="max_length", + truncation=True, return_overflowing_tokens=True, ) if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0: diff --git a/examples/seq2seq/run_eval.py b/examples/seq2seq/run_eval.py index 82699d1f22..6a0480f36d 100644 --- a/examples/seq2seq/run_eval.py +++ b/examples/seq2seq/run_eval.py @@ -45,9 +45,9 @@ def generate_summaries_or_translations( for batch in tqdm(list(chunks(examples, batch_size))): if "t5" in model_name: batch = [model.config.prefix + text for text in batch] - batch = tokenizer.batch_encode_plus( - batch, max_length=1024, return_tensors="pt", truncation=True, pad_to_max_length=True - ).to(device) + batch = tokenizer(batch, max_length=1024, return_tensors="pt", truncation=True, padding="max_length").to( + device + ) summaries = model.generate(**batch, **gen_kwargs) dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False) for hypothesis in dec: diff --git a/examples/seq2seq/utils.py b/examples/seq2seq/utils.py index 39cfa9d380..99a2abbe20 100644 --- 
a/examples/seq2seq/utils.py +++ b/examples/seq2seq/utils.py @@ -41,12 +41,12 @@ def encode_file( assert lns, f"found empty file at {data_path}" examples = [] for text in tqdm(lns, desc=f"Tokenizing {data_path.name}"): - tokenized = tokenizer.batch_encode_plus( + tokenized = tokenizer( [text], max_length=max_length, - pad_to_max_length=pad_to_max_length, - add_prefix_space=True, + padding="max_length" if pad_to_max_length else None, truncation=True, + add_prefix_space=True, return_tensors=return_tensors, ) assert tokenized.input_ids.shape[1] == max_length diff --git a/model_cards/SparkBeyond/roberta-large-sts-b/README.md b/model_cards/SparkBeyond/roberta-large-sts-b/README.md index 6fa2fd2a63..a32cb57486 100644 --- a/model_cards/SparkBeyond/roberta-large-sts-b/README.md +++ b/model_cards/SparkBeyond/roberta-large-sts-b/README.md @@ -40,7 +40,7 @@ def roberta_similarity_batches(to_predict): return similarity_scores def similarity_roberta(model, tokenizer, sent_pairs): - batch_token = tokenizer.batch_encode_plus(sent_pairs, pad_to_max_length=True, max_length=500) + batch_token = tokenizer(sent_pairs, padding='max_length', truncation=True, max_length=500) res = model(torch.tensor(batch_token['input_ids']).cuda(), attention_mask=torch.tensor(batch_token["attention_mask"]).cuda()) return res diff --git a/model_cards/a-ware/bart-squadv2/README.md b/model_cards/a-ware/bart-squadv2/README.md index a6e088c721..164c6a220f 100644 --- a/model_cards/a-ware/bart-squadv2/README.md +++ b/model_cards/a-ware/bart-squadv2/README.md @@ -60,7 +60,7 @@ tokenizer = BartTokenizer.from_pretrained('a-ware/bart-squadv2') model = BartForQuestionAnswering.from_pretrained('a-ware/bart-squadv2') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" -encoding = tokenizer.encode_plus(question, text, return_tensors='pt') +encoding = tokenizer(question, text, return_tensors='pt') input_ids = encoding['input_ids'] attention_mask = encoding['attention_mask'] diff --git 
a/model_cards/a-ware/xlmroberta-squadv2/README.md b/model_cards/a-ware/xlmroberta-squadv2/README.md index 48e09fccbb..eb08cbbc06 100644 --- a/model_cards/a-ware/xlmroberta-squadv2/README.md +++ b/model_cards/a-ware/xlmroberta-squadv2/README.md @@ -43,7 +43,7 @@ tokenizer = XLMRobertaTokenizer.from_pretrained('a-ware/xlmroberta-squadv2') model = XLMRobertaForQuestionAnswering.from_pretrained('a-ware/xlmroberta-squadv2') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" -encoding = tokenizer.encode_plus(question, text, return_tensors='pt') +encoding = tokenizer(question, text, return_tensors='pt') input_ids = encoding['input_ids'] attention_mask = encoding['attention_mask'] diff --git a/model_cards/google/reformer-enwik8/README.md b/model_cards/google/reformer-enwik8/README.md index 5086ce80cc..998ac33d9c 100644 --- a/model_cards/google/reformer-enwik8/README.md +++ b/model_cards/google/reformer-enwik8/README.md @@ -14,7 +14,7 @@ Therefore, this model does not need a tokenizer. 
The following function can inst import torch # Encoding -def encode(list_of_strings, pad_to_max_length=True, pad_token_id=0): +def encode(list_of_strings, pad_token_id=0): max_length = max([len(string) for string in list_of_strings]) # create emtpy tensors diff --git a/model_cards/lserinol/bert-turkish-question-answering/README.md b/model_cards/lserinol/bert-turkish-question-answering/README.md index 5a0a8df935..66a7dc0b24 100644 --- a/model_cards/lserinol/bert-turkish-question-answering/README.md +++ b/model_cards/lserinol/bert-turkish-question-answering/README.md @@ -43,7 +43,7 @@ questions = [ ] for question in questions: - inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt") + inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt") input_ids = inputs["input_ids"].tolist()[0] text_tokens = tokenizer.convert_ids_to_tokens(input_ids) diff --git a/model_cards/mrm8488/longformer-base-4096-finetuned-squadv2/README.md b/model_cards/mrm8488/longformer-base-4096-finetuned-squadv2/README.md index 6c81af1821..f789f6a606 100644 --- a/model_cards/mrm8488/longformer-base-4096-finetuned-squadv2/README.md +++ b/model_cards/mrm8488/longformer-base-4096-finetuned-squadv2/README.md @@ -50,7 +50,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/longformer-base-4 text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this." question = "What has Huggingface done ?" 
-encoding = tokenizer.encode_plus(question, text, return_tensors="pt") +encoding = tokenizer(question, text, return_tensors="pt") input_ids = encoding["input_ids"] # default is local attention everywhere diff --git a/model_cards/mrm8488/t5-base-finetuned-squadv2/README.md b/model_cards/mrm8488/t5-base-finetuned-squadv2/README.md index 7456f498ce..d072a4bcff 100644 --- a/model_cards/mrm8488/t5-base-finetuned-squadv2/README.md +++ b/model_cards/mrm8488/t5-base-finetuned-squadv2/README.md @@ -55,7 +55,7 @@ model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-squadv2") def get_answer(question, context): input_text = "question: %s context: %s " % (question, context) - features = tokenizer.batch_encode_plus([input_text], return_tensors='pt') + features = tokenizer([input_text], return_tensors='pt') output = model.generate(input_ids=features['input_ids'], attention_mask=features['attention_mask']) diff --git a/model_cards/oliverguhr/german-sentiment-bert/README.md b/model_cards/oliverguhr/german-sentiment-bert/README.md index 30afb653d8..2594aacab5 100644 --- a/model_cards/oliverguhr/german-sentiment-bert/README.md +++ b/model_cards/oliverguhr/german-sentiment-bert/README.md @@ -55,7 +55,7 @@ class SentimentModel(): def predict_sentiment(self, texts: List[str])-> List[str]: texts = [self.clean_text(text) for text in texts] # Add special tokens takes care of adding [CLS], [SEP], ... tokens in the right way for each model. 
- input_ids = self.tokenizer.batch_encode_plus(texts,pad_to_max_length=True, add_special_tokens=True) + input_ids = self.tokenizer(texts, padding=True, truncation=True, add_special_tokens=True) input_ids = torch.tensor(input_ids["input_ids"]) with torch.no_grad(): diff --git a/model_cards/valhalla/bart-large-finetuned-squadv1/README.md b/model_cards/valhalla/bart-large-finetuned-squadv1/README.md index e2baaa4977..e53087c1ba 100644 --- a/model_cards/valhalla/bart-large-finetuned-squadv1/README.md +++ b/model_cards/valhalla/bart-large-finetuned-squadv1/README.md @@ -50,7 +50,7 @@ tokenizer = BartTokenizer.from_pretrained('valhalla/bart-large-finetuned-squadv1 model = BartForQuestionAnswering.from_pretrained('valhalla/bart-large-finetuned-squadv1') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" -encoding = tokenizer.encode_plus(question, text, return_tensors='pt') +encoding = tokenizer(question, text, return_tensors='pt') input_ids = encoding['input_ids'] attention_mask = encoding['attention_mask'] diff --git a/model_cards/valhalla/longformer-base-4096-finetuned-squadv1/README.md b/model_cards/valhalla/longformer-base-4096-finetuned-squadv1/README.md index f7328b1cfe..b6b096bf47 100644 --- a/model_cards/valhalla/longformer-base-4096-finetuned-squadv1/README.md +++ b/model_cards/valhalla/longformer-base-4096-finetuned-squadv1/README.md @@ -33,7 +33,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("valhalla/longformer-base- text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this." question = "What has Huggingface done ?" 
-encoding = tokenizer.encode_plus(question, text, return_tensors="pt") +encoding = tokenizer(question, text, return_tensors="pt") input_ids = encoding["input_ids"] # default is local attention everywhere diff --git a/model_cards/valhalla/t5-base-squad/README.md b/model_cards/valhalla/t5-base-squad/README.md index 16ec7e1ad1..18f314759a 100644 --- a/model_cards/valhalla/t5-base-squad/README.md +++ b/model_cards/valhalla/t5-base-squad/README.md @@ -19,7 +19,7 @@ model = AutoModelWithLMHead.from_pretrained("valhalla/t5-base-squad") def get_answer(question, context): input_text = "question: %s context: %s " % (question, context) - features = tokenizer.batch_encode_plus([input_text], return_tensors='pt') + features = tokenizer([input_text], return_tensors='pt') out = model.generate(input_ids=features['input_ids'], attention_mask=features['attention_mask']) diff --git a/notebooks/02-transformers.ipynb b/notebooks/02-transformers.ipynb index 81615b501e..636d1c738c 100644 --- a/notebooks/02-transformers.ipynb +++ b/notebooks/02-transformers.ipynb @@ -255,7 +255,7 @@ "# tokens_pt = torch.tensor([tokens_ids])\n", "\n", "# This code can be factored into one-line as follow\n", - "tokens_pt2 = tokenizer.encode_plus(\"This is an input example\", return_tensors=\"pt\")\n", + "tokens_pt2 = tokenizer(\"This is an input example\", return_tensors=\"pt\")\n", "\n", "for key, value in tokens_pt2.items():\n", " print(\"{}:\\n\\t{}\".format(key, value))\n", @@ -268,7 +268,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As you can see above, the method `encode_plus` provides a convenient way to generate all the required parameters\n", + "As you can see above, calling the tokenizer provides a convenient way to generate all the required parameters\n", "that will go through the model. 
\n", "\n", "Moreover, you might have noticed it generated some additional tensors: \n", @@ -302,10 +302,10 @@ ], "source": [ "# Single segment input\n", - "single_seg_input = tokenizer.encode_plus(\"This is a sample input\")\n", + "single_seg_input = tokenizer(\"This is a sample input\")\n", "\n", "# Multiple segment input\n", - "multi_seg_input = tokenizer.encode_plus(\"This is segment A\", \"This is segment B\")\n", + "multi_seg_input = tokenizer(\"This is segment A\", \"This is segment B\")\n", "\n", "print(\"Single segment token (str): {}\".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))\n", "print(\"Single segment token (int): {}\".format(single_seg_input['input_ids']))\n", @@ -344,9 +344,9 @@ ], "source": [ "# Padding highlight\n", - "tokens = tokenizer.batch_encode_plus(\n", + "tokens = tokenizer(\n", " [\"This is a sample\", \"This is another longer sample text\"], \n", - " pad_to_max_length=True # First sentence will have some PADDED tokens to match second sequence length\n", + " padding=True # First sentence will have some PADDED tokens to match second sequence length\n", ")\n", "\n", "for i in range(2):\n", @@ -405,8 +405,8 @@ ], "source": [ "# transformers generates a ready to use dictionary with all the required parameters for the specific framework.\n", - "input_tf = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"tf\")\n", - "input_pt = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"pt\")\n", + "input_tf = tokenizer(\"This is a sample input\", return_tensors=\"tf\")\n", + "input_pt = tokenizer(\"This is a sample input\", return_tensors=\"pt\")\n", "\n", "# Let's compare the outputs\n", "output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt)\n", @@ -464,7 +464,7 @@ "from transformers import DistilBertModel\n", "\n", "bert_distil = DistilBertModel.from_pretrained('distilbert-base-cased')\n", - "input_pt = tokenizer.encode_plus(\n", + "input_pt = tokenizer(\n", " 'This is a sample 
input to demonstrate performance of distiled models especially inference time', \n", " return_tensors=\"pt\"\n", ")\n", @@ -514,7 +514,7 @@ "de_bert = BertModel.from_pretrained(\"dbmdz/bert-base-german-cased\")\n", "de_tokenizer = BertTokenizer.from_pretrained(\"dbmdz/bert-base-german-cased\")\n", "\n", - "de_input = de_tokenizer.encode_plus(\n", + "de_input = de_tokenizer(\n", " \"Hugging Face ist eine französische Firma mit Sitz in New-York.\",\n", " return_tensors=\"pt\"\n", ")\n", @@ -559,4 +559,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/notebooks/04-onnx-export.ipynb b/notebooks/04-onnx-export.ipynb index 4666097c30..acac251912 100644 --- a/notebooks/04-onnx-export.ipynb +++ b/notebooks/04-onnx-export.ipynb @@ -248,7 +248,7 @@ "cpu_model = create_model_for_provider(\"onnx/bert-base-cased.onnx\", \"CPUExecutionProvider\")\n", "\n", "# Inputs are provided through numpy array\n", - "model_inputs = tokenizer.encode_plus(\"My name is Bert\", return_tensors=\"pt\")\n", + "model_inputs = tokenizer(\"My name is Bert\", return_tensors=\"pt\")\n", "inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}\n", "\n", "# Run the model (None = get all the outputs)\n", diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py index 5fd0c6c96b..fcbfea1d22 100644 --- a/src/transformers/convert_graph_to_onnx.py +++ b/src/transformers/convert_graph_to_onnx.py @@ -86,7 +86,7 @@ def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], D print("Found {} {} with shape: {}".format("input" if is_input else "output", name, axes)) return axes - tokens = nlp.tokenizer.encode_plus("This is a sample output", return_tensors=framework) + tokens = nlp.tokenizer("This is a sample output", return_tensors=framework) seq_len = tokens.input_ids.shape[-1] outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens) diff --git 
a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index 6fae7b55c5..94988a859b 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -91,7 +91,7 @@ class LineByLineTextDataset(Dataset): with open(file_path, encoding="utf-8") as f: lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())] - batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size) + batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size) self.examples = batch_encoding["input_ids"] def __len__(self): diff --git a/src/transformers/data/processors/glue.py b/src/transformers/data/processors/glue.py index 870817a60e..8a96240486 100644 --- a/src/transformers/data/processors/glue.py +++ b/src/transformers/data/processors/glue.py @@ -137,8 +137,11 @@ def _glue_convert_examples_to_features( labels = [label_from_example(example) for example in examples] - batch_encoding = tokenizer.batch_encode_plus( - [(example.text_a, example.text_b) for example in examples], max_length=max_length, pad_to_max_length=True, + batch_encoding = tokenizer( + [(example.text_a, example.text_b) for example in examples], + max_length=max_length, + padding="max_length", + truncation=True, ) features = [] diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index 9ca27d7ff4..0c68df6820 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -120,7 +120,9 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q spans = [] - truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) + truncated_query = tokenizer.encode( + example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length + ) 
sequence_added_tokens = ( tokenizer.max_len - tokenizer.max_len_single_sentence + 1 if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer)) @@ -131,14 +133,14 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q span_doc_tokens = all_doc_tokens while len(spans) * doc_stride < len(all_doc_tokens): - encoded_dict = tokenizer.encode_plus( + encoded_dict = tokenizer.encode_plus( # TODO(thom) update this logic truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, + truncation="only_second" if tokenizer.padding_side == "right" else "only_first", + padding="max_length", max_length=max_seq_length, return_overflowing_tokens=True, - pad_to_max_length=True, stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first", return_token_type_ids=True, ) @@ -176,7 +178,9 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q spans.append(encoded_dict) - if "overflowing_tokens" not in encoded_dict: + if "overflowing_tokens" not in encoded_dict or ( + "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0 + ): break span_doc_tokens = encoded_dict["overflowing_tokens"] diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index b2a3c4f611..29e0d01a78 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -278,7 +278,7 @@ PT_MULTIPLE_CHOICE_SAMPLE = r""" >>> choice1 = "It is eaten while held in the hand." 
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 - >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', pad_to_max_length=True) + >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True) >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1 >>> # the linear classifier still needs to be trained @@ -391,7 +391,7 @@ TF_MULTIPLE_CHOICE_SAMPLE = r""" >>> choice0 = "It is eaten with a fork and a knife." >>> choice1 = "It is eaten while held in the hand." - >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', pad_to_max_length=True) + >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', padding=True) >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}} >>> outputs = model(inputs) # batch size is 1 diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index cc8390a149..731ee4a7ee 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -402,7 +402,7 @@ ALBERT_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.AlbertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 75a7345c8e..23d25cfa09 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -579,7 +579,7 @@ BERT_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.BertTokenizer`. 
See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py index 3f11109a4d..0bceef1259 100644 --- a/src/transformers/modeling_ctrl.py +++ b/src/transformers/modeling_ctrl.py @@ -251,7 +251,7 @@ CTRL_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.CTRLTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py index 398b0e2958..cf93c0b1c7 100644 --- a/src/transformers/modeling_distilbert.py +++ b/src/transformers/modeling_distilbert.py @@ -360,7 +360,7 @@ DISTILBERT_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.DistilBertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): @@ -893,7 +893,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel): >>> choice1 = "It is eaten while held in the hand." 
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 - >>> encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True) + >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True) >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 >>> # the linear classifier still needs to be trained diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py index e08e487153..050d96909d 100644 --- a/src/transformers/modeling_electra.py +++ b/src/transformers/modeling_electra.py @@ -186,7 +186,7 @@ ELECTRA_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.ElectraTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py index 1fed5b6853..5e5128a0cb 100644 --- a/src/transformers/modeling_flaubert.py +++ b/src/transformers/modeling_flaubert.py @@ -65,7 +65,7 @@ FLAUBERT_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py index 8bcb8876a9..b839cba162 100644 --- a/src/transformers/modeling_gpt2.py +++ b/src/transformers/modeling_gpt2.py @@ -302,7 +302,7 @@ GPT2_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.GPT2Tokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py index 7d2a6978b5..9d869e73a1 100644 --- a/src/transformers/modeling_longformer.py +++ b/src/transformers/modeling_longformer.py @@ -454,7 +454,7 @@ LONGFORMER_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.LonmgformerTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): @@ -970,7 +970,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel): >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" - >>> encoding = tokenizer.encode_plus(question, text, return_tensors="pt") + >>> encoding = tokenizer(question, text, return_tensors="pt") >>> input_ids = encoding["input_ids"] >>> # default is local attention everywhere diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py index 5165d3fa2b..4cbaf3a502 100644 --- a/src/transformers/modeling_mobilebert.py +++ b/src/transformers/modeling_mobilebert.py @@ -678,7 +678,7 @@ MOBILEBERT_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.MobileBertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py index 949a6ccd7a..e831594cbd 100644 --- a/src/transformers/modeling_openai.py +++ b/src/transformers/modeling_openai.py @@ -296,7 +296,7 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_reformer.py b/src/transformers/modeling_reformer.py index 7b763ebf96..65cfd50345 100644 --- a/src/transformers/modeling_reformer.py +++ b/src/transformers/modeling_reformer.py @@ -1487,7 +1487,7 @@ REFORMER_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.ReformerTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_retribert.py b/src/transformers/modeling_retribert.py index e0395ceb03..8b03a6d0f9 100644 --- a/src/transformers/modeling_retribert.py +++ b/src/transformers/modeling_retribert.py @@ -153,7 +153,7 @@ class RetriBertModel(RetriBertPreTrainedModel): Indices can be obtained using :class:`transformers.RetriBertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask_query (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py index 7c3f08294d..7a7baea014 100644 --- a/src/transformers/modeling_roberta.py +++ b/src/transformers/modeling_roberta.py @@ -103,7 +103,7 @@ ROBERTA_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.RobertaTokenizer`. 
See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py index 9f988c647e..3c2a7bbf98 100644 --- a/src/transformers/modeling_tf_albert.py +++ b/src/transformers/modeling_tf_albert.py @@ -674,7 +674,7 @@ ALBERT_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.AlbertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py index 55431d1fbb..0bab7699ce 100644 --- a/src/transformers/modeling_tf_bert.py +++ b/src/transformers/modeling_tf_bert.py @@ -664,7 +664,7 @@ BERT_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): @@ -882,7 +882,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." next_sentence = "The sky is blue due to the shorter wavelength of blue light." 
- encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='tf') + encoding = tokenizer(prompt, next_sentence, return_tensors='tf') logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] assert logits[0][0] < logits[0][1] # the next sentence was random diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py index 4bc7cf0910..96c2d0e2ad 100644 --- a/src/transformers/modeling_tf_ctrl.py +++ b/src/transformers/modeling_tf_ctrl.py @@ -437,7 +437,7 @@ CTRL_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.CTRLTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py index bf06335b20..e9fe573bde 100644 --- a/src/transformers/modeling_tf_distilbert.py +++ b/src/transformers/modeling_tf_distilbert.py @@ -545,7 +545,7 @@ DISTILBERT_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_electra.py b/src/transformers/modeling_tf_electra.py index 1aae20b56f..b77c04e4d2 100644 --- a/src/transformers/modeling_tf_electra.py +++ b/src/transformers/modeling_tf_electra.py @@ -339,7 +339,7 @@ ELECTRA_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.ElectraTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_flaubert.py b/src/transformers/modeling_tf_flaubert.py index 09e1f00df4..d10324de08 100644 --- a/src/transformers/modeling_tf_flaubert.py +++ b/src/transformers/modeling_tf_flaubert.py @@ -60,7 +60,7 @@ FLAUBERT_INPUTS_DOCSTRING = r""" Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. 
diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py index f2bc63392a..5c4bbd27c6 100644 --- a/src/transformers/modeling_tf_gpt2.py +++ b/src/transformers/modeling_tf_gpt2.py @@ -441,7 +441,7 @@ GPT2_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.GPT2Tokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): diff --git a/src/transformers/modeling_tf_mobilebert.py b/src/transformers/modeling_tf_mobilebert.py index 3178bccfaf..5e68853a1f 100644 --- a/src/transformers/modeling_tf_mobilebert.py +++ b/src/transformers/modeling_tf_mobilebert.py @@ -794,7 +794,7 @@ MOBILEBERT_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.MobileBertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py index c254e32751..477e63ee59 100644 --- a/src/transformers/modeling_tf_openai.py +++ b/src/transformers/modeling_tf_openai.py @@ -405,7 +405,7 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.GPT2Tokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py index 751ca17abc..4148a69065 100644 --- a/src/transformers/modeling_tf_roberta.py +++ b/src/transformers/modeling_tf_roberta.py @@ -156,7 +156,7 @@ ROBERTA_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.RobertaTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py index ae9accb255..33fd3ba7ff 100644 --- a/src/transformers/modeling_tf_transfo_xl.py +++ b/src/transformers/modeling_tf_transfo_xl.py @@ -694,7 +694,7 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.TransfoXLTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py index 007bb572e6..e912891c21 100644 --- a/src/transformers/modeling_tf_xlm.py +++ b/src/transformers/modeling_tf_xlm.py @@ -555,7 +555,7 @@ XLM_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.BertTokenizer`. 
See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py index 80ee28fc78..3ec9659323 100644 --- a/src/transformers/modeling_tf_xlnet.py +++ b/src/transformers/modeling_tf_xlnet.py @@ -778,7 +778,7 @@ XLNET_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.XLNetTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py index 2e39ef025c..8abd643da2 100644 --- a/src/transformers/modeling_transfo_xl.py +++ b/src/transformers/modeling_transfo_xl.py @@ -609,7 +609,7 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.TransfoXLTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py index 03a1ebe237..2c91e834b5 100644 --- a/src/transformers/modeling_xlm.py +++ b/src/transformers/modeling_xlm.py @@ -259,7 +259,7 @@ XLM_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index f6dcd679eb..0b67bafc3b 100644 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -573,7 +573,7 @@ XLNET_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 0c1e399001..0ed72371a2 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -456,17 +456,14 @@ class Pipeline(_ScikitCompat): """ return {name: tensor.to(self.device) for name, tensor in inputs.items()} - def _parse_and_tokenize(self, *args, pad_to_max_length=True, add_special_tokens=True, **kwargs): + def _parse_and_tokenize(self, *args, padding=True, add_special_tokens=True, **kwargs): """ Parse arguments and tokenize """ # Parse arguments inputs = self._args_parser(*args, **kwargs) - inputs = self.tokenizer.batch_encode_plus( - inputs, - add_special_tokens=add_special_tokens, - return_tensors=self.framework, - pad_to_max_length=pad_to_max_length, + inputs = self.tokenizer( + inputs, add_special_tokens=add_special_tokens, return_tensors=self.framework, padding=padding, ) return inputs @@ -623,10 +620,10 @@ class TextGenerationPipeline(Pipeline): with self.device_placement(): if self.model.__class__.__name__ in ["XLNetLMHeadModel", "TransfoXLLMHeadModel"]: inputs = self._parse_and_tokenize( - self.PADDING_TEXT + prompt_text, pad_to_max_length=False, add_special_tokens=False + self.PADDING_TEXT + prompt_text, padding=False, add_special_tokens=False ) else: - inputs = self._parse_and_tokenize(prompt_text, pad_to_max_length=False, add_special_tokens=False) + inputs = self._parse_and_tokenize(prompt_text, padding=False, add_special_tokens=False) # set input_ids to None to allow empty prompt if inputs["input_ids"].shape[-1] == 0: @@ -920,11 +917,8 @@ class TokenClassificationPipeline(Pipeline): # Manage correct placement of the tensors with self.device_placement(): - tokens = self.tokenizer.encode_plus( - sentence, - return_attention_mask=False, - return_tensors=self.framework, - max_length=self.tokenizer.max_len, + tokens = 
self.tokenizer( + sentence, return_attention_mask=False, return_tensors=self.framework, truncation=True, ) # Forward @@ -1187,12 +1181,12 @@ class QuestionAnsweringPipeline(Pipeline): examples = self._args_parser(*args, **kwargs) features_list = [ squad_convert_examples_to_features( - [example], - self.tokenizer, - kwargs["max_seq_len"], - kwargs["doc_stride"], - kwargs["max_question_len"], - False, + examples=[example], + tokenizer=self.tokenizer, + max_seq_length=kwargs["max_seq_len"], + doc_stride=kwargs["doc_stride"], + max_query_length=kwargs["max_question_len"], + is_training=False, tqdm_enabled=False, ) for example in examples @@ -1431,11 +1425,11 @@ class SummarizationPipeline(Pipeline): ), "Please make sure that the tokenizer has a pad_token_id when using a batch input" documents = ([prefix + document for document in documents[0]],) - pad_to_max_length = True + padding = True elif isinstance(documents[0], str): documents = (prefix + documents[0],) - pad_to_max_length = False + padding = False else: raise ValueError( " `documents[0]`: {} have the wrong format. The should be either of type `str` or type `list`".format( @@ -1444,7 +1438,7 @@ class SummarizationPipeline(Pipeline): ) with self.device_placement(): - inputs = self._parse_and_tokenize(*documents, pad_to_max_length=pad_to_max_length) + inputs = self._parse_and_tokenize(*documents, padding=padding) if self.framework == "pt": inputs = self.ensure_tensor_on_device(**inputs) @@ -1549,11 +1543,11 @@ class TranslationPipeline(Pipeline): self.tokenizer.pad_token_id is not None ), "Please make sure that the tokenizer has a pad_token_id when using a batch input" args = ([prefix + text for text in args[0]],) - pad_to_max_length = True + padding = True elif isinstance(args[0], str): args = (prefix + args[0],) - pad_to_max_length = False + padding = False else: raise ValueError( " `documents[0]`: {} have the wrong format. 
The should be either of type `str` or type `list`".format( @@ -1562,7 +1556,7 @@ class TranslationPipeline(Pipeline): ) with self.device_placement(): - inputs = self._parse_and_tokenize(*args, pad_to_max_length=pad_to_max_length) + inputs = self._parse_and_tokenize(*args, padding=padding) if self.framework == "pt": inputs = self.ensure_tensor_on_device(**inputs) diff --git a/src/transformers/tokenization_albert.py b/src/transformers/tokenization_albert.py index f33ce15f78..0b7ca36e81 100644 --- a/src/transformers/tokenization_albert.py +++ b/src/transformers/tokenization_albert.py @@ -263,7 +263,7 @@ class AlbertTokenizer(PreTrainedTokenizer): ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0 (:obj:`List[int]`): diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index 78d5a1474f..e3157e9eec 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -125,7 +125,7 @@ class MBartTokenizer(XLMRobertaTokenizer): return self.sp_model.IdToPiece(index - self.fairseq_offset) def set_lang(self, lang: str) -> None: - """Set the current language code in order to call batch_encode_plus properly.""" + """Set the current language code in order to call tokenizer properly.""" self.cur_lang_code = self.lang_code_to_id[lang] def prepare_translation_batch( diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py index c503e8504f..b168fe96f4 100644 --- a/src/transformers/tokenization_bert.py +++ b/src/transformers/tokenization_bert.py @@ -263,7 +263,7 @@ class BertTokenizer(PreTrainedTokenizer): ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. 
This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0 (:obj:`List[int]`): diff --git a/src/transformers/tokenization_camembert.py b/src/transformers/tokenization_camembert.py index 5b8fe7ab00..84ecd2fef3 100644 --- a/src/transformers/tokenization_camembert.py +++ b/src/transformers/tokenization_camembert.py @@ -171,7 +171,7 @@ class CamembertTokenizer(PreTrainedTokenizer): ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0 (:obj:`List[int]`): diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py index 19b482976c..41abab03ce 100644 --- a/src/transformers/tokenization_roberta.py +++ b/src/transformers/tokenization_roberta.py @@ -193,7 +193,7 @@ class RobertaTokenizer(GPT2Tokenizer): ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0 (:obj:`List[int]`): diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 4a69ecc725..15fb58bff0 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -820,7 +820,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 
+ special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0: list of ids (must not contain special tokens) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 13e24bf02c..db9b4e45e7 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1583,6 +1583,42 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True` (to lift the ambiguity with a batch of sequences) """ + # Input type checking for clearer error + assert isinstance(text, str) or ( + isinstance(text, (list, tuple)) + and ( + len(text) == 0 + or ( + isinstance(text[0], str) + or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str))) + ) + ) + ), ( + "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + + assert ( + text_pair is None + or isinstance(text_pair, str) + or ( + isinstance(text_pair, (list, tuple)) + and ( + len(text_pair) == 0 + or ( + isinstance(text_pair[0], str) + or ( + isinstance(text_pair[0], (list, tuple)) + and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str)) + ) + ) + ) + ) + ), ( + "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples)." 
+ ) + is_batched = bool( (not is_pretokenized and isinstance(text, (list, tuple))) or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))) diff --git a/src/transformers/tokenization_xlm.py b/src/transformers/tokenization_xlm.py index 0ca13344ac..0a62468ea5 100644 --- a/src/transformers/tokenization_xlm.py +++ b/src/transformers/tokenization_xlm.py @@ -882,7 +882,7 @@ class XLMTokenizer(PreTrainedTokenizer): ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0 (:obj:`List[int]`): diff --git a/src/transformers/tokenization_xlm_roberta.py b/src/transformers/tokenization_xlm_roberta.py index f2f5f76c79..787be9b064 100644 --- a/src/transformers/tokenization_xlm_roberta.py +++ b/src/transformers/tokenization_xlm_roberta.py @@ -206,7 +206,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0 (:obj:`List[int]`): diff --git a/src/transformers/tokenization_xlnet.py b/src/transformers/tokenization_xlnet.py index 93ef2d2bb7..c3689b21d4 100644 --- a/src/transformers/tokenization_xlnet.py +++ b/src/transformers/tokenization_xlnet.py @@ -267,7 +267,7 @@ class XLNetTokenizer(PreTrainedTokenizer): ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method.
Args: token_ids_0 (:obj:`List[int]`): diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py index 6a96b0ff9d..91dc7f8c0b 100644 --- a/templates/adding_a_new_model/tokenization_xxx.py +++ b/templates/adding_a_new_model/tokenization_xxx.py @@ -171,7 +171,7 @@ class XxxTokenizer(PreTrainedTokenizer): def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0: list of ids (must not contain special tokens) diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py index 3cafb3a40d..209abbb211 100644 --- a/tests/test_modeling_bart.py +++ b/tests/test_modeling_bart.py @@ -626,9 +626,9 @@ class BartModelIntegrationTests(unittest.TestCase): PGE_ARTICLE = """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""" EXPECTED_SUMMARY = "California's largest power company has begun shutting off power to tens of thousands of homes and businesses in the state."
- dct = tok.batch_encode_plus([PGE_ARTICLE], max_length=1024, pad_to_max_length=True, return_tensors="pt",).to( - torch_device - ) + dct = tok.batch_encode_plus( + [PGE_ARTICLE], max_length=1024, padding="max_length", truncation=True, return_tensors="pt", + ).to(torch_device) hypotheses_batch = model.generate( input_ids=dct["input_ids"], @@ -672,7 +672,8 @@ class BartModelIntegrationTests(unittest.TestCase): dct = tok.batch_encode_plus( [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY], max_length=1024, - pad_to_max_length=True, + padding="max_length", + truncation=True, return_tensors="pt", ) diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py index a5d7a1a4d2..39254c4f47 100644 --- a/tests/test_modeling_t5.py +++ b/tests/test_modeling_t5.py @@ -375,10 +375,11 @@ class T5ModelIntegrationTests(unittest.TestCase): summarization_config = task_specific_config.get("summarization", {}) model.config.update(summarization_config) - dct = tok.batch_encode_plus( + dct = tok( [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]], max_length=512, - pad_to_max_length=True, + padding="max_length", + truncation=True, return_tensors="pt", ) self.assertEqual(512, dct["input_ids"].shape[1]) diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py index 28cf60e461..2f996fc90c 100644 --- a/tests/test_modeling_tf_t5.py +++ b/tests/test_modeling_tf_t5.py @@ -276,10 +276,11 @@ class TFT5ModelIntegrationTests(unittest.TestCase): summarization_config = task_specific_config.get("summarization", {}) model.config.update(summarization_config) - dct = tok.batch_encode_plus( + dct = tok( [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]], max_length=512, - pad_to_max_length=True, + padding="max_length", + truncation=True, return_tensors="tf", ) self.assertEqual(512, dct["input_ids"].shape[1])