[T5 Tokenizer] Model has no fixed position ids - there is no hardcode… (#16990)

* [T5 Tokenizer] Model has no fixed position ids - there is no hardcoded max length * [T5 Tokenizer] Model has no fixed position ids - there is no hardcoded max length * correct t5 tokenizer * correct t5 tokenizer * fix test * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * finish Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-05-02 21:27:34 +02:00
parent 1073f00d4e
commit 31616b8d61
4 changed files with 70 additions and 1 deletions
--- a/tests/t5/test_tokenization_t5.py
+++ b/tests/t5/test_tokenization_t5.py
@@ -223,6 +223,9 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
            ["I am a small frog" * 1000, "I am a small frog"], padding=True, truncation=True, return_tensors=FRAMEWORK
        )
        self.assertIsInstance(batch, BatchEncoding)
+        # Since T5 does NOT have a max input length,
+        # this test should be changed to the following in Transformers v5:
+        # self.assertEqual(batch.input_ids.shape, (2, 8001))
        self.assertEqual(batch.input_ids.shape, (2, 512))

    def test_eos_in_input(self):
@@ -361,6 +364,13 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                    ),
                )

+    # overwritten from `test_tokenization_common` since T5 has no max length
+    def test_pretrained_model_lists(self):
+        # We should have at least one default checkpoint for each tokenizer
+        # We should specify the max input length as well (used in some part to list the pretrained checkpoints)
+        self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
+        self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
+
    @slow
    def test_tokenizer_integration(self):
        # fmt: off