[T5 Tokenizer] Model has no fixed position ids - there is no hardcode… (#16990)

* [T5 Tokenizer] Model has no fixed position ids - there is no hardcoded max length

* [T5 Tokenizer] Model has no fixed position ids - there is no hardcoded max length

* correct t5 tokenizer

* correct t5 tokenizer

* fix test

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* finish

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
Patrick von Platen
2022-05-02 21:27:34 +02:00
committed by GitHub
parent 1073f00d4e
commit 31616b8d61
4 changed files with 70 additions and 1 deletions

View File

@@ -223,6 +223,9 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
["I am a small frog" * 1000, "I am a small frog"], padding=True, truncation=True, return_tensors=FRAMEWORK
)
self.assertIsInstance(batch, BatchEncoding)
# Since T5 does NOT have a max input length,
# this test should be changed to the following in Transformers v5:
# self.assertEqual(batch.input_ids.shape, (2, 8001))
self.assertEqual(batch.input_ids.shape, (2, 512))
def test_eos_in_input(self):
@@ -361,6 +364,13 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
),
)
# overwritten from `test_tokenization_common` since T5 has no max length
def test_pretrained_model_lists(self):
# We should have at least one default checkpoint for each tokenizer
# We should specify the max input length as well (used in some part to list the pretrained checkpoints)
self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
@slow
def test_tokenizer_integration(self):
# fmt: off