[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors
* style
* fix tests - better type checking error messages
* better type checking
* include awesome fix by @LysandreJik for #5310
* updated doc and examples
2020-06-26 19:48:14 +02:00
parent fd405e9a93
commit 601d4d699c
73 changed files with 180 additions and 138 deletions
--- a/examples/seq2seq/utils.py
+++ b/examples/seq2seq/utils.py
@@ -41,12 +41,12 @@ def encode_file(
    assert lns, f"found empty file at {data_path}"
    examples = []
    for text in tqdm(lns, desc=f"Tokenizing {data_path.name}"):
-        tokenized = tokenizer.batch_encode_plus(
+        tokenized = tokenizer(
            [text],
            max_length=max_length,
-            pad_to_max_length=pad_to_max_length,
-            add_prefix_space=True,
+            padding="max_length" if pad_to_max_length else None,
            truncation=True,
+            add_prefix_space=True,
            return_tensors=return_tensors,
        )
        assert tokenized.input_ids.shape[1] == max_length