[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors
* style
* fix tests - better type checking error messages
* better type checking
* include awesome fix by @LysandreJik for #5310
* updated doc and examples
2020-06-26 19:48:14 +02:00
parent fd405e9a93
commit 601d4d699c
73 changed files with 180 additions and 138 deletions
--- a/examples/seq2seq/run_eval.py
+++ b/examples/seq2seq/run_eval.py
@@ -45,9 +45,9 @@ def generate_summaries_or_translations(
    for batch in tqdm(list(chunks(examples, batch_size))):
        if "t5" in model_name:
            batch = [model.config.prefix + text for text in batch]
-        batch = tokenizer.batch_encode_plus(
-            batch, max_length=1024, return_tensors="pt", truncation=True, pad_to_max_length=True
-        ).to(device)
+        batch = tokenizer(batch, max_length=1024, return_tensors="pt", truncation=True, padding="max_length").to(
+            device
+        )
        summaries = model.generate(**batch, **gen_kwargs)
        dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        for hypothesis in dec:
--- a/examples/seq2seq/utils.py
+++ b/examples/seq2seq/utils.py
@@ -41,12 +41,12 @@ def encode_file(
    assert lns, f"found empty file at {data_path}"
    examples = []
    for text in tqdm(lns, desc=f"Tokenizing {data_path.name}"):
-        tokenized = tokenizer.batch_encode_plus(
+        tokenized = tokenizer(
            [text],
            max_length=max_length,
-            pad_to_max_length=pad_to_max_length,
-            add_prefix_space=True,
+            padding="max_length" if pad_to_max_length else None,
            truncation=True,
+            add_prefix_space=True,
            return_tensors=return_tensors,
        )
        assert tokenized.input_ids.shape[1] == max_length