[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors * style * fix tests - better type checking error messages * better type checking * include awesome fix by @LysandreJik for #5310 * updated doc and examples
2020-06-26 19:48:14 +02:00
parent fd405e9a93
commit 601d4d699c
73 changed files with 180 additions and 138 deletions
--- a/examples/adversarial/utils_hans.py
+++ b/examples/adversarial/utils_hans.py
@@ -298,12 +298,13 @@ def hans_convert_examples_to_features(
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))

-        inputs = tokenizer.encode_plus(
+        inputs = tokenizer(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
-            pad_to_max_length=True,
+            padding="max_length",
+            truncation=True,
            return_overflowing_tokens=True,
        )

--- a/examples/longform-qa/eli5_utils.py
+++ b/examples/longform-qa/eli5_utils.py
@@ -193,12 +193,12 @@ def make_qa_retriever_model(model_name="google/bert_uncased_L-8_H-512_A-8", from
 def make_qa_retriever_batch(qa_list, tokenizer, max_len=64, device="cuda:0"):
    q_ls = [q for q, a in qa_list]
    a_ls = [a for q, a in qa_list]
-    q_toks = tokenizer.batch_encode_plus(q_ls, max_length=max_len, pad_to_max_length=True)
+    q_toks = tokenizer(q_ls, max_length=max_len, padding="max_length", truncation=True)
    q_ids, q_mask = (
        torch.LongTensor(q_toks["input_ids"]).to(device),
        torch.LongTensor(q_toks["attention_mask"]).to(device),
    )
-    a_toks = tokenizer.batch_encode_plus(a_ls, max_length=max_len, pad_to_max_length=True)
+    a_toks = tokenizer(a_ls, max_length=max_len, padding="max_length", truncation=True)
    a_ids, a_mask = (
        torch.LongTensor(a_toks["input_ids"]).to(device),
        torch.LongTensor(a_toks["attention_mask"]).to(device),
@@ -375,12 +375,12 @@ def make_qa_s2s_model(model_name="facebook/bart-large", from_file=None, device="
 def make_qa_s2s_batch(qa_list, tokenizer, max_len=64, max_a_len=360, device="cuda:0"):
    q_ls = [q for q, a in qa_list]
    a_ls = [a for q, a in qa_list]
-    q_toks = tokenizer.batch_encode_plus(q_ls, max_length=max_len, pad_to_max_length=True)
+    q_toks = tokenizer(q_ls, max_length=max_len, padding="max_length", truncation=True)
    q_ids, q_mask = (
        torch.LongTensor(q_toks["input_ids"]).to(device),
        torch.LongTensor(q_toks["attention_mask"]).to(device),
    )
-    a_toks = tokenizer.batch_encode_plus(a_ls, max_length=min(max_len, max_a_len), pad_to_max_length=True)
+    a_toks = tokenizer(a_ls, max_length=min(max_len, max_a_len), padding="max_length", truncation=True)
    a_ids, a_mask = (
        torch.LongTensor(a_toks["input_ids"]).to(device),
        torch.LongTensor(a_toks["attention_mask"]).to(device),
@@ -531,7 +531,7 @@ def qa_s2s_generate(
 # ELI5-trained retrieval model usage
 ###############
 def embed_passages_for_retrieval(passages, tokenizer, qa_embedder, max_length=128, device="cuda:0"):
-    a_toks = tokenizer.batch_encode_plus(passages, max_length=max_length, pad_to_max_length=True)
+    a_toks = tokenizer(passages, max_length=max_length, padding="max_length", truncation=True)
    a_ids, a_mask = (
        torch.LongTensor(a_toks["input_ids"]).to(device),
        torch.LongTensor(a_toks["attention_mask"]).to(device),
@@ -542,7 +542,7 @@ def embed_passages_for_retrieval(passages, tokenizer, qa_embedder, max_length=12


 def embed_questions_for_retrieval(q_ls, tokenizer, qa_embedder, device="cuda:0"):
-    q_toks = tokenizer.batch_encode_plus(q_ls, max_length=128, pad_to_max_length=True)
+    q_toks = tokenizer(q_ls, max_length=128, padding="max_length", truncation=True)
    q_ids, q_mask = (
        torch.LongTensor(q_toks["input_ids"]).to(device),
        torch.LongTensor(q_toks["attention_mask"]).to(device),
--- a/examples/movement-pruning/emmental/modeling_bert_masked.py
+++ b/examples/movement-pruning/emmental/modeling_bert_masked.py
@@ -424,7 +424,7 @@ MASKED_BERT_INPUTS_DOCSTRING = r"""

            Indices can be obtained using :class:`transformers.BertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
--- a/examples/multiple-choice/utils_multiple_choice.py
+++ b/examples/multiple-choice/utils_multiple_choice.py
@@ -510,12 +510,13 @@ def convert_examples_to_features(
            else:
                text_b = example.question + " " + ending

-            inputs = tokenizer.encode_plus(
+            inputs = tokenizer(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=max_length,
-                pad_to_max_length=True,
+                padding="max_length",
+                truncation=True,
                return_overflowing_tokens=True,
            )
            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
--- a/examples/seq2seq/run_eval.py
+++ b/examples/seq2seq/run_eval.py
@@ -45,9 +45,9 @@ def generate_summaries_or_translations(
    for batch in tqdm(list(chunks(examples, batch_size))):
        if "t5" in model_name:
            batch = [model.config.prefix + text for text in batch]
-        batch = tokenizer.batch_encode_plus(
-            batch, max_length=1024, return_tensors="pt", truncation=True, pad_to_max_length=True
-        ).to(device)
+        batch = tokenizer(batch, max_length=1024, return_tensors="pt", truncation=True, padding="max_length").to(
+            device
+        )
        summaries = model.generate(**batch, **gen_kwargs)
        dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        for hypothesis in dec:
--- a/examples/seq2seq/utils.py
+++ b/examples/seq2seq/utils.py
@@ -41,12 +41,12 @@ def encode_file(
    assert lns, f"found empty file at {data_path}"
    examples = []
    for text in tqdm(lns, desc=f"Tokenizing {data_path.name}"):
-        tokenized = tokenizer.batch_encode_plus(
+        tokenized = tokenizer(
            [text],
            max_length=max_length,
-            pad_to_max_length=pad_to_max_length,
-            add_prefix_space=True,
+            padding="max_length" if pad_to_max_length else None,
            truncation=True,
+            add_prefix_space=True,
            return_tensors=return_tensors,
        )
        assert tokenized.input_ids.shape[1] == max_length