Tokenizers should be framework agnostic (#8599)

* Tokenizers should be framework agnostic
* Run the slow tests
* Not testing
* Fix documentation
* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-11-17 14:03:03 -05:00
parent 7f3b41a306
commit 3095ee9dab
28 changed files with 73 additions and 177 deletions
--- a/tests/test_tokenization_pegasus.py
+++ b/tests/test_tokenization_pegasus.py
@@ -61,7 +61,9 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    def test_pegasus_large_seq2seq_truncation(self):
        src_texts = ["This is going to be way too long." * 150, "short example"]
        tgt_texts = ["not super long but more than 5 tokens", "tiny"]
-        batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, max_target_length=5)
+        batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch(
+            src_texts, tgt_texts=tgt_texts, max_target_length=5, return_tensors="pt"
+        )
        assert batch.input_ids.shape == (2, 1024)
        assert batch.attention_mask.shape == (2, 1024)
        assert "labels" in batch  # because tgt_texts was specified