Tokenizers should be framework agnostic (#8599)

* Tokenizers should be framework agnostic * Run the slow tests * Not testing * Fix documentation * Apply suggestions from code review Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-11-17 14:03:03 -05:00
parent 7f3b41a306
commit 3095ee9dab
28 changed files with 73 additions and 177 deletions
--- a/tests/test_modeling_marian.py
+++ b/tests/test_modeling_marian.py
@@ -132,9 +132,9 @@ class MarianIntegrationTest(unittest.TestCase):
        self.assertListEqual(self.expected_text, generated_words)

    def translate_src_text(self, **tokenizer_kwargs):
-        model_inputs = self.tokenizer.prepare_seq2seq_batch(src_texts=self.src_text, **tokenizer_kwargs).to(
-            torch_device
-        )
+        model_inputs = self.tokenizer.prepare_seq2seq_batch(
+            src_texts=self.src_text, return_tensors="pt", **tokenizer_kwargs
+        ).to(torch_device)
        self.assertEqual(self.model.device, model_inputs.input_ids.device)
        generated_ids = self.model.generate(
            model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2, max_length=128
@@ -151,7 +151,9 @@ class TestMarian_EN_DE_More(MarianIntegrationTest):
        src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."]
        expected_ids = [38, 121, 14, 697, 38848, 0]

-        model_inputs: dict = self.tokenizer.prepare_seq2seq_batch(src, tgt_texts=tgt).to(torch_device)
+        model_inputs: dict = self.tokenizer.prepare_seq2seq_batch(src, tgt_texts=tgt, return_tensors="pt").to(
+            torch_device
+        )

        self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist())

@@ -171,12 +173,16 @@ class TestMarian_EN_DE_More(MarianIntegrationTest):

    def test_unk_support(self):
        t = self.tokenizer
-        ids = t.prepare_seq2seq_batch(["||"]).to(torch_device).input_ids[0].tolist()
+        ids = t.prepare_seq2seq_batch(["||"], return_tensors="pt").to(torch_device).input_ids[0].tolist()
        expected = [t.unk_token_id, t.unk_token_id, t.eos_token_id]
        self.assertEqual(expected, ids)

    def test_pad_not_split(self):
-        input_ids_w_pad = self.tokenizer.prepare_seq2seq_batch(["I am a small frog <pad>"]).input_ids[0].tolist()
+        input_ids_w_pad = (
+            self.tokenizer.prepare_seq2seq_batch(["I am a small frog <pad>"], return_tensors="pt")
+            .input_ids[0]
+            .tolist()
+        )
        expected_w_pad = [38, 121, 14, 697, 38848, self.tokenizer.pad_token_id, 0]  # pad
        self.assertListEqual(expected_w_pad, input_ids_w_pad)

@@ -294,7 +300,7 @@ class TestMarian_en_ROMANCE(MarianIntegrationTest):
        normalized = self.tokenizer.normalize("")
        self.assertIsInstance(normalized, str)
        with self.assertRaises(ValueError):
-            self.tokenizer.prepare_seq2seq_batch([""])
+            self.tokenizer.prepare_seq2seq_batch([""], return_tensors="pt")

    @slow
    def test_pipeline(self):