[T5 Tokenizer] Model has no fixed position ids - there is no hardcoded max length (#16990)
* [T5 Tokenizer] Model has no fixed position ids - there is no hardcoded max length
* [T5 Tokenizer] Model has no fixed position ids - there is no hardcoded max length
* correct t5 tokenizer
* correct t5 tokenizer
* fix test
* Apply suggestions from code review
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* finish
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
1073f00d4e
commit
31616b8d61
@@ -41,6 +41,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# TODO(PVP) - this should be removed in Transformers v5
|
||||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||||
"t5-small": 512,
|
"t5-small": 512,
|
||||||
"t5-base": 512,
|
"t5-base": 512,
|
||||||
@@ -151,6 +153,24 @@ class T5Tokenizer(PreTrainedTokenizer):
|
|||||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||||
self.sp_model.Load(vocab_file)
|
self.sp_model.Load(vocab_file)
|
||||||
|
|
||||||
|
@staticmethod
def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
    """Pick the ``model_max_length`` to use for a T5 checkpoint.

    Checkpoints listed in ``T5Tokenizer.max_model_input_sizes`` carry a
    deprecated hard-coded maximum length. For those checkpoints only:

    - if ``init_max_model_length`` was given and differs from
      ``max_model_length``, the explicit value is returned;
    - if ``init_max_model_length`` is ``None``, a ``FutureWarning`` is
      emitted and ``max_model_length`` is returned.

    In every other case ``max_model_length`` is returned unchanged.

    Args:
        pretrained_model_name_or_path: Checkpoint identifier looked up in
            ``T5Tokenizer.max_model_input_sizes``.
        max_model_length: Length already resolved by the caller.
        init_max_model_length: Length passed at instantiation, or ``None``.

    Returns:
        The maximum model length the tokenizer should be created with.
    """
    # Checkpoints without a deprecated hard-coded size need no correction.
    if pretrained_model_name_or_path not in T5Tokenizer.max_model_input_sizes:
        return max_model_length

    deprecated_max_model_length = T5Tokenizer.max_model_input_sizes[pretrained_model_name_or_path]

    if init_max_model_length is None:
        # Nothing was requested explicitly: keep the legacy value but warn
        # that it is scheduled to change in Transformers v5.
        warnings.warn(
            f"This tokenizer was incorrectly instantiated with a model max length of {deprecated_max_model_length} which will be corrected in Transformers v5.\n"
            f"For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n"
            f"- Be aware that you SHOULD NOT rely on {pretrained_model_name_or_path} automatically truncating your input to {deprecated_max_model_length} when padding/encoding.\n"
            f"- If you want to encode/pad to sequences longer than {deprecated_max_model_length} you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n"
            f"- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.",
            FutureWarning,
        )
        return max_model_length

    # An explicit value that differs from the resolved one takes precedence.
    if init_max_model_length != max_model_length:
        return init_max_model_length

    return max_model_length
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocab_size(self):
|
def vocab_size(self):
|
||||||
return self.sp_model.get_piece_size() + self._extra_ids
|
return self.sp_model.get_piece_size() + self._extra_ids
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import warnings
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
@@ -50,6 +51,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# TODO(PVP) - this should be removed in Transformers v5
|
||||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||||
"t5-small": 512,
|
"t5-small": 512,
|
||||||
"t5-base": 512,
|
"t5-base": 512,
|
||||||
@@ -142,6 +145,24 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
|
|||||||
self.can_save_slow_tokenizer = False if not self.vocab_file else True
|
self.can_save_slow_tokenizer = False if not self.vocab_file else True
|
||||||
self._extra_ids = extra_ids
|
self._extra_ids = extra_ids
|
||||||
|
|
||||||
|
@staticmethod
def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
    """Pick the ``model_max_length`` to use for a T5 checkpoint (fast tokenizer).

    Checkpoints listed in ``T5TokenizerFast.max_model_input_sizes`` carry a
    deprecated hard-coded maximum length. For those checkpoints only:

    - if ``init_max_model_length`` was given and differs from
      ``max_model_length``, the explicit value is returned;
    - if ``init_max_model_length`` is ``None``, a ``FutureWarning`` is
      emitted and ``max_model_length`` is returned.

    In every other case ``max_model_length`` is returned unchanged.

    Args:
        pretrained_model_name_or_path: Checkpoint identifier looked up in
            ``T5TokenizerFast.max_model_input_sizes``.
        max_model_length: Length already resolved by the caller.
        init_max_model_length: Length passed at instantiation, or ``None``.

    Returns:
        The maximum model length the tokenizer should be created with.
    """
    # Checkpoints without a deprecated hard-coded size need no correction.
    if pretrained_model_name_or_path not in T5TokenizerFast.max_model_input_sizes:
        return max_model_length

    deprecated_max_model_length = T5TokenizerFast.max_model_input_sizes[pretrained_model_name_or_path]

    if init_max_model_length is None:
        # Nothing was requested explicitly: keep the legacy value but warn
        # that it is scheduled to change in Transformers v5.
        warnings.warn(
            f"This tokenizer was incorrectly instantiated with a model max length of {deprecated_max_model_length} which will be corrected in Transformers v5.\n"
            f"For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n"
            f"- Be aware that you SHOULD NOT rely on {pretrained_model_name_or_path} automatically truncating your input to {deprecated_max_model_length} when padding/encoding.\n"
            f"- If you want to encode/pad to sequences longer than {deprecated_max_model_length} you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n"
            f"- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.",
            FutureWarning,
        )
        return max_model_length

    # An explicit value that differs from the resolved one takes precedence.
    if init_max_model_length != max_model_length:
        return init_max_model_length

    return max_model_length
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||||
if not self.can_save_slow_tokenizer:
|
if not self.can_save_slow_tokenizer:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
|||||||
@@ -1899,9 +1899,19 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
if pretrained_model_name_or_path in cls.max_model_input_sizes:
|
if pretrained_model_name_or_path in cls.max_model_input_sizes:
|
||||||
# if we're using a pretrained model, ensure the tokenizer
|
# if we're using a pretrained model, ensure the tokenizer
|
||||||
# wont index sequences longer than the number of positional embeddings
|
# wont index sequences longer than the number of positional embeddings
|
||||||
|
|
||||||
model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
|
model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
|
||||||
if model_max_length is not None and isinstance(model_max_length, (int, float)):
|
if model_max_length is not None and isinstance(model_max_length, (int, float)):
|
||||||
init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)
|
|
||||||
|
model_max_length = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)
|
||||||
|
# TODO(PVP) - uncomment following line in Transformers v5
|
||||||
|
# init_kwargs["model_max_length"] = model_max_length
|
||||||
|
# TODO(PVP) - remove in Transformers v5
|
||||||
|
# ---
|
||||||
|
init_kwargs["model_max_length"] = cls._eventually_correct_t5_max_length(
|
||||||
|
pretrained_model_name_or_path, model_max_length, init_kwargs.get("model_max_length")
|
||||||
|
)
|
||||||
|
# ---
|
||||||
|
|
||||||
# Merge resolved_vocab_files arguments in init_kwargs.
|
# Merge resolved_vocab_files arguments in init_kwargs.
|
||||||
added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
|
added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
|
||||||
@@ -1983,6 +1993,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
|
|
||||||
return tokenizer
|
return tokenizer
|
||||||
|
|
||||||
|
@staticmethod
def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
    """Return ``max_model_length`` unchanged.

    This method should be deleted in Transformers v5. Its only purpose is to
    potentially throw a warning that incorrectly defined max lengths of T5's
    tokenizer are used, which will be corrected in Transformers v5. This
    implementation ignores ``pretrained_model_name_or_path`` and
    ``init_max_model_length`` and emits no warning itself.
    """
    return max_model_length
|
||||||
|
|
||||||
def save_pretrained(
|
def save_pretrained(
|
||||||
self,
|
self,
|
||||||
save_directory: Union[str, os.PathLike],
|
save_directory: Union[str, os.PathLike],
|
||||||
|
|||||||
@@ -223,6 +223,9 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
["I am a small frog" * 1000, "I am a small frog"], padding=True, truncation=True, return_tensors=FRAMEWORK
|
["I am a small frog" * 1000, "I am a small frog"], padding=True, truncation=True, return_tensors=FRAMEWORK
|
||||||
)
|
)
|
||||||
self.assertIsInstance(batch, BatchEncoding)
|
self.assertIsInstance(batch, BatchEncoding)
|
||||||
|
# Since T5 does NOT have a max input length,
|
||||||
|
# this test should be changed to the following in Transformers v5:
|
||||||
|
# self.assertEqual(batch.input_ids.shape, (2, 8001))
|
||||||
self.assertEqual(batch.input_ids.shape, (2, 512))
|
self.assertEqual(batch.input_ids.shape, (2, 512))
|
||||||
|
|
||||||
def test_eos_in_input(self):
|
def test_eos_in_input(self):
|
||||||
@@ -361,6 +364,13 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# overwritten from `test_tokenization_common` since T5 has no max length
def test_pretrained_model_lists(self):
    """Check that the tokenizer class advertises at least one default checkpoint.

    Only the vocab-file map is inspected: it must be non-empty, and its first
    entry must itself list at least one checkpoint. No max-input-length check
    is made here (presumably the common version has one - confirm against
    `test_tokenization_common`).
    """
    vocab_files_map = self.tokenizer_class.pretrained_vocab_files_map

    # At least one kind of vocab file must be registered ...
    self.assertGreaterEqual(len(vocab_files_map), 1)

    # ... and the first registered kind must map at least one checkpoint.
    first_checkpoint_map = list(vocab_files_map.values())[0]
    self.assertGreaterEqual(len(first_checkpoint_map), 1)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_tokenizer_integration(self):
|
def test_tokenizer_integration(self):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
|||||||
Reference in New Issue
Block a user