From 31616b8d613dcb7ac69b562d51b42d0db379f72f Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Mon, 2 May 2022 21:27:34 +0200
Subject: [PATCH] =?UTF-8?q?[T5=20Tokenizer]=20Model=20has=20no=20fixed=20p?=
 =?UTF-8?q?osition=20ids=20-=20there=20is=20no=20hardcode=E2=80=A6=20(#169?=
 =?UTF-8?q?90)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [T5 Tokenizer] Model has no fixed position ids - there is no hardcoded max length

* [T5 Tokenizer] Model has no fixed position ids - there is no hardcoded max length

* correct t5 tokenizer

* correct t5 tokenizer

* fix test

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* finish

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 src/transformers/models/t5/tokenization_t5.py | 20 ++++++++++++++++++
 .../models/t5/tokenization_t5_fast.py         | 21 +++++++++++++++++++
 src/transformers/tokenization_utils_base.py   | 20 +++++++++++++++++-
 tests/t5/test_tokenization_t5.py              | 10 +++++++++
 4 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py
index a356aa70c1..09414ae407 100644
--- a/src/transformers/models/t5/tokenization_t5.py
+++ b/src/transformers/models/t5/tokenization_t5.py
@@ -41,6 +41,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
     }
 }
 
+
+# TODO(PVP) - this should be removed in Transformers v5
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     "t5-small": 512,
     "t5-base": 512,
@@ -151,6 +153,24 @@ class T5Tokenizer(PreTrainedTokenizer):
         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
         self.sp_model.Load(vocab_file)
 
+    @staticmethod
+    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
+        if pretrained_model_name_or_path in T5Tokenizer.max_model_input_sizes:
+            deprecated_max_model_length = T5Tokenizer.max_model_input_sizes[pretrained_model_name_or_path]
+            if init_max_model_length is not None and init_max_model_length != max_model_length:
+                return init_max_model_length
+            elif init_max_model_length is None:
+                warnings.warn(
+                    f"This tokenizer was incorrectly instantiated with a model max length of {deprecated_max_model_length} which will be corrected in Transformers v5.\n"
+                    f"For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n"
+                    f"- Be aware that you SHOULD NOT rely on {pretrained_model_name_or_path} automatically truncating your input to {deprecated_max_model_length} when padding/encoding.\n"
+                    f"- If you want to encode/pad to sequences longer than {deprecated_max_model_length} you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n"
+                    f"- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.",
+                    FutureWarning,
+                )
+
+        return max_model_length
+
     @property
     def vocab_size(self):
         return self.sp_model.get_piece_size() + self._extra_ids
diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py
index 4612868236..77a86810b3 100644
--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ b/src/transformers/models/t5/tokenization_t5_fast.py
@@ -16,6 +16,7 @@
 
 
 import os
+import warnings
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
@@ -50,6 +51,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
     },
 }
 
+
+# TODO(PVP) - this should be removed in Transformers v5
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     "t5-small": 512,
     "t5-base": 512,
@@ -142,6 +145,24 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         self.can_save_slow_tokenizer = False if not self.vocab_file else True
         self._extra_ids = extra_ids
 
+    @staticmethod
+    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
+        if pretrained_model_name_or_path in T5TokenizerFast.max_model_input_sizes:
+            deprecated_max_model_length = T5TokenizerFast.max_model_input_sizes[pretrained_model_name_or_path]
+            if init_max_model_length is not None and init_max_model_length != max_model_length:
+                return init_max_model_length
+            elif init_max_model_length is None:
+                warnings.warn(
+                    f"This tokenizer was incorrectly instantiated with a model max length of {deprecated_max_model_length} which will be corrected in Transformers v5.\n"
+                    f"For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n"
+                    f"- Be aware that you SHOULD NOT rely on {pretrained_model_name_or_path} automatically truncating your input to {deprecated_max_model_length} when padding/encoding.\n"
+                    f"- If you want to encode/pad to sequences longer than {deprecated_max_model_length} you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n"
+                    f"- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.",
+                    FutureWarning,
+                )
+
+        return max_model_length
+
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not self.can_save_slow_tokenizer:
             raise ValueError(
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index f587cc060d..43d37e67cc 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1899,9 +1899,19 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         if pretrained_model_name_or_path in cls.max_model_input_sizes:
             # if we're using a pretrained model, ensure the tokenizer
             # wont index sequences longer than the number of positional embeddings
+
             model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
             if model_max_length is not None and isinstance(model_max_length, (int, float)):
-                init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)
+
+                model_max_length = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)
+                # TODO(PVP) - uncomment following line in Transformers v5
+                # init_kwargs["model_max_length"] = model_max_length
+                # TODO(PVP) - remove in Transformers v5
+                # ---
+                init_kwargs["model_max_length"] = cls._eventually_correct_t5_max_length(
+                    pretrained_model_name_or_path, model_max_length, init_kwargs.get("model_max_length")
+                )
+                # ---
 
         # Merge resolved_vocab_files arguments in init_kwargs.
         added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
@@ -1983,6 +1993,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
 
         return tokenizer
 
+    @staticmethod
+    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
+        # This method should be deleted in Transformers v5
+        # Its only purpose is to potentially throw a warning
+        # that incorrectly defined max lengths of T5's tokenizer are used
+        # which we will correct in Transformers v5.
+        return max_model_length
+
     def save_pretrained(
         self,
         save_directory: Union[str, os.PathLike],
diff --git a/tests/t5/test_tokenization_t5.py b/tests/t5/test_tokenization_t5.py
index 2deaa21f3a..6f5b0f1198 100644
--- a/tests/t5/test_tokenization_t5.py
+++ b/tests/t5/test_tokenization_t5.py
@@ -223,6 +223,9 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             ["I am a small frog" * 1000, "I am a small frog"], padding=True, truncation=True, return_tensors=FRAMEWORK
         )
         self.assertIsInstance(batch, BatchEncoding)
+        # Since T5 does NOT have a max input length,
+        # this test should be changed to the following in Transformers v5:
+        # self.assertEqual(batch.input_ids.shape, (2, 8001))
         self.assertEqual(batch.input_ids.shape, (2, 512))
 
     def test_eos_in_input(self):
@@ -361,6 +364,13 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     ),
                 )
 
+    # overwritten from `test_tokenization_common` since T5 has no max length
+    def test_pretrained_model_lists(self):
+        # We should have at least one default checkpoint for each tokenizer
+        # We should specify the max input length as well (used in some part to list the pretrained checkpoints)
+        self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
+        self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
+
     @slow
     def test_tokenizer_integration(self):
         # fmt: off