Adding a test for multibyte Unicode. (#13447)

* Adding a test for multibyte Unicode. * Adding some accents. * Making sure decoding works. * Make tests pass by being cheesy.
2021-09-06 16:11:23 +02:00
parent 607611f240
commit cf4eb8b3f9
2 changed files with 40 additions and 7 deletions
--- a/tests/test_tokenization_byt5.py
+++ b/tests/test_tokenization_byt5.py
@@ -56,6 +56,27 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        batch_without_eos_added = tokenizer(["hi", "I went to the gym", ""])
        self.assertListEqual(batch_with_eos_added["input_ids"], batch_without_eos_added["input_ids"])

+    def test_multibytes_char(self):
+        """Check that multi-byte UTF-8 characters encode to the expected ids and decode back."""
+        tok = self.t5_base_tokenizer
+
+        # "€" shows up as three ids (229, 133, 175) in the expected sequence.
+        euro_text = "Unicode €."
+        euro_ids = [88, 113, 108, 102, 114, 103, 104, 35, 229, 133, 175, 49, 1]
+        self.assertEqual(tok(euro_text)["input_ids"], euro_ids)
+        # Decoding the ids, trailing 1 included, yields the text followed by "</s>".
+        self.assertEqual(tok.decode(euro_ids), "Unicode €.</s>")
+
+        # Each accented letter shows up as a pair of ids (198, then 171..174).
+        accent_text = "e è é ê ë"
+        accent_ids = [104, 35, 198, 171, 35, 198, 172, 35, 198, 173, 35, 198, 174, 1]
+        self.assertEqual(tok(accent_text)["input_ids"], accent_ids)
+        self.assertEqual(tok.decode(accent_ids), "e è é ê ë</s>")
+
+        # Same round trip, going through `encode` rather than `__call__`.
+        round_trip = tok.decode(tok.encode(accent_text))
+        self.assertEqual(round_trip, "e è é ê ë</s>")
+
    def test_prepare_batch_integration(self):
        tokenizer = self.t5_base_tokenizer
        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]