[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors
* style
* fix tests - better type checking error messages
* better type checking
* include awesome fix by @LysandreJik for #5310
* updated doc and examples
2020-06-26 19:48:14 +02:00
parent fd405e9a93
commit 601d4d699c
73 changed files with 180 additions and 138 deletions
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -375,10 +375,11 @@ class T5ModelIntegrationTests(unittest.TestCase):
        summarization_config = task_specific_config.get("summarization", {})
        model.config.update(summarization_config)

-        dct = tok.batch_encode_plus(
+        dct = tok(
            [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]],
            max_length=512,
-            pad_to_max_length=True,
+            padding="max_length",
+            truncation=True,
            return_tensors="pt",
        )
        self.assertEqual(512, dct["input_ids"].shape[1])