Adding Fast tokenizers for SentencePiece based tokenizers - Breaking: remove Transfo-XL fast tokenizer (#7141)
* [WIP] SP tokenizers * fixing tests for T5 * WIP tokenizers * serialization * update T5 * WIP T5 tokenization * slow to fast conversion script * Refactoring to move tokenizer implementations inside transformers * Adding gpt - refactoring - quality * WIP adding several tokenizers to the fast world * WIP Roberta - moving implementations * update to dev4 switch file loading to in-memory loading * Updating and fixing * advancing on the tokenizers - updating do_lower_case * style and quality * moving forward with tokenizers conversion and tests * MBart, T5 * dumping the fast version of transformer XL * Adding to autotokenizers + style/quality * update init and space_between_special_tokens * style and quality * bump up tokenizers version * add protobuf * fix pickle Bert JP with Mecab * fix newly added tokenizers * style and quality * fix bert japanese * fix funnel * limit tokenizer warning to one occurrence * clean up file * fix new tokenizers * fast tokenizers deep tests * WIP adding all the special fast tests on the new fast tokenizers * quick fix * adding more fast tokenizers in the fast tests * all tokenizers in fast version tested * Adding BertGenerationFast * bump up setup.py for CI * remove BertGenerationFast (too early) * bump up tokenizers version * Clean old docstrings * Typo * Update following Lysandre comments Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
This commit is contained in:
@@ -646,6 +646,8 @@ class SpecialTokensMixin:
|
||||
# which are not yet in the vocabulary. Necessary for serialization/de-serialization
|
||||
# TODO clean this up at some point (probably by switching to fast tokenizers)
|
||||
for key, value in kwargs.items():
|
||||
if value is None:
|
||||
continue
|
||||
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
|
||||
if key == "additional_special_tokens":
|
||||
assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
|
||||
@@ -778,6 +780,9 @@ class SpecialTokensMixin:
|
||||
|
||||
return self._add_tokens(new_tokens, special_tokens=special_tokens)
|
||||
|
||||
def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
    """
    Placeholder for the token-adding implementation.

    This base version does no work and always raises :obj:`NotImplementedError`.

    Args:
        new_tokens: The tokens to add, given as strings or ``AddedToken`` instances.
        special_tokens: Flag forwarded by the caller; unused by this base version.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()
|
||||
|
||||
@property
|
||||
def bos_token(self) -> str:
|
||||
"""
|
||||
@@ -1293,11 +1298,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
max_model_input_sizes: Dict[str, Optional[int]] = {}
|
||||
model_input_names: List[str] = ["token_type_ids", "attention_mask"]
|
||||
padding_side: str = "right"
|
||||
slow_tokenizer_class = None
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
# inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
|
||||
self.init_inputs = ()
|
||||
self.init_kwargs = kwargs
|
||||
self.init_kwargs = copy.deepcopy(kwargs)
|
||||
|
||||
# For backward compatibility we fallback to set model_max_length from max_len if provided
|
||||
model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
|
||||
@@ -1311,6 +1317,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
|
||||
self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
|
||||
|
||||
self.deprecation_warnings = (
|
||||
{}
|
||||
) # Use to store when we have already noticed a deprecation warning (avoid overlogging).
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@property
|
||||
@@ -1343,9 +1353,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
def max_len_single_sentence(self, value) -> int:
|
||||
# For backward compatibility, allow to try to setup 'max_len_single_sentence'.
|
||||
if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
|
||||
logger.warning(
|
||||
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
|
||||
)
|
||||
if not self.deprecation_warnings.get("max_len_single_sentence", False):
|
||||
logger.warning(
|
||||
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
|
||||
)
|
||||
self.deprecation_warnings["max_len_single_sentence"] = True
|
||||
else:
|
||||
raise ValueError(
|
||||
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
|
||||
@@ -1355,16 +1367,18 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
def max_len_sentences_pair(self, value) -> int:
|
||||
# For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
|
||||
if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
|
||||
logger.warning(
|
||||
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
|
||||
)
|
||||
if not self.deprecation_warnings.get("max_len_sentences_pair", False):
|
||||
logger.warning(
|
||||
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
|
||||
)
|
||||
self.deprecation_warnings["max_len_sentences_pair"] = True
|
||||
else:
|
||||
raise ValueError(
|
||||
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *inputs, **kwargs):
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
|
||||
r"""
|
||||
Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from
|
||||
a predefined tokenizer.
|
||||
@@ -1425,10 +1439,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
assert tokenizer.unk_token == '<unk>'
|
||||
|
||||
"""
|
||||
return cls._from_pretrained(*inputs, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
|
||||
cache_dir = kwargs.pop("cache_dir", None)
|
||||
force_download = kwargs.pop("force_download", False)
|
||||
resume_download = kwargs.pop("resume_download", False)
|
||||
@@ -1475,7 +1485,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
"added_tokens_file": ADDED_TOKENS_FILE,
|
||||
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
|
||||
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
|
||||
"full_tokenizer_file": FULL_TOKENIZER_FILE,
|
||||
"tokenizer_file": FULL_TOKENIZER_FILE,
|
||||
}
|
||||
# Look for the tokenizer files
|
||||
for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
|
||||
@@ -1541,6 +1551,28 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
else:
|
||||
logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))
|
||||
|
||||
return cls._from_pretrained(
|
||||
resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _from_pretrained(
|
||||
cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
|
||||
):
|
||||
# We instantiate fast tokenizers based on a slow tokenizer for now
|
||||
# In the future we can also use a direct way based on saving/instantiating
|
||||
# tokenizer's Tokenizer directly from its serialization JSON
|
||||
if cls.slow_tokenizer_class is not None:
|
||||
slow_tokenizer = cls.slow_tokenizer_class._from_pretrained(
|
||||
copy.deepcopy(resolved_vocab_files),
|
||||
pretrained_model_name_or_path,
|
||||
copy.deepcopy(init_configuration),
|
||||
*init_inputs,
|
||||
**(copy.deepcopy(kwargs)),
|
||||
)
|
||||
else:
|
||||
slow_tokenizer = None
|
||||
|
||||
# Prepare tokenizer initialization kwargs
|
||||
# Did we save some inputs and kwargs to reload?
|
||||
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
|
||||
@@ -1556,6 +1588,19 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
# Update with newly provided kwargs
|
||||
init_kwargs.update(kwargs)
|
||||
|
||||
# Convert AddedTokens serialized as dict to class instances
|
||||
def convert_added_tokens(obj: Union[AddedToken, Any]):
    """
    Recursively rebuild ``AddedToken`` instances from their serialized dict form.

    - A dict whose ``"__type"`` entry equals ``"AddedToken"`` becomes ``AddedToken(**fields)``, where ``fields``
      is the dict without the ``"__type"`` marker.
    - A list or tuple is converted element by element and returned as a list.
    - Any other dict is converted value by value.
    - Every other object is returned unchanged.

    The input is never modified: the marker is filtered out of a copy rather than popped from ``obj``, so the
    caller's data can safely be reused after the conversion.
    """
    if isinstance(obj, dict):
        if obj.get("__type") == "AddedToken":
            # Drop the type marker without mutating the caller's dict.
            fields = {k: v for k, v in obj.items() if k != "__type"}
            return AddedToken(**fields)
        return {k: convert_added_tokens(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_added_tokens(o) for o in obj]
    return obj
|
||||
|
||||
init_kwargs = convert_added_tokens(init_kwargs)
|
||||
|
||||
# Set max length if needed
|
||||
if pretrained_model_name_or_path in cls.max_model_input_sizes:
|
||||
# if we're using a pretrained model, ensure the tokenizer
|
||||
@@ -1570,6 +1615,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
if args_name not in init_kwargs:
|
||||
init_kwargs[args_name] = file_path
|
||||
|
||||
if slow_tokenizer is not None:
|
||||
init_kwargs["__slow_tokenizer"] = slow_tokenizer
|
||||
|
||||
# Instantiate tokenizer.
|
||||
try:
|
||||
tokenizer = cls(*init_inputs, **init_kwargs)
|
||||
@@ -1580,8 +1628,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
)
|
||||
|
||||
# Save inputs and kwargs for saving and re-loading with ``save_pretrained``
|
||||
tokenizer.init_inputs = init_inputs
|
||||
tokenizer.init_kwargs = init_kwargs
|
||||
# Removed: Now done at the base class level
|
||||
# tokenizer.init_inputs = init_inputs
|
||||
# tokenizer.init_kwargs = init_kwargs
|
||||
|
||||
# If there is a complementary special token map, load it
|
||||
special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
|
||||
@@ -1589,11 +1638,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
|
||||
special_tokens_map = json.load(special_tokens_map_handle)
|
||||
|
||||
special_tokens_map = convert_added_tokens(special_tokens_map)
|
||||
for key, value in special_tokens_map.items():
|
||||
if isinstance(value, dict):
|
||||
value = AddedToken(**value)
|
||||
elif isinstance(value, list):
|
||||
value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
|
||||
setattr(tokenizer, key, value)
|
||||
|
||||
# Add supplementary tokens.
|
||||
@@ -1623,14 +1669,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
def save_pretrained(self, save_directory: str) -> Tuple[str]:
|
||||
"""
|
||||
Save the tokenizer vocabulary files together with:
|
||||
Save the full tokenizer state.
|
||||
|
||||
- added tokens,
|
||||
- special tokens to class attributes mapping,
|
||||
- tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert).
|
||||
|
||||
This method makes sure the full tokenizer can then be re-loaded using the
|
||||
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained` class method.
|
||||
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.
|
||||
|
||||
.. Note::
|
||||
A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with
|
||||
this method will not be possible to load back
|
||||
in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer` instance. It can only be loaded
|
||||
in a "fast" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizerFast` instance.
|
||||
|
||||
.. Warning::
|
||||
This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
|
||||
@@ -1648,7 +1697,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
os.makedirs(save_directory, exist_ok=True)
|
||||
|
||||
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
|
||||
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
|
||||
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
|
||||
|
||||
tokenizer_config = copy.deepcopy(self.init_kwargs)
|
||||
@@ -1657,22 +1705,33 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
for file_id in self.vocab_files_names.keys():
|
||||
tokenizer_config.pop(file_id, None)
|
||||
|
||||
# Sanitize AddedTokens
|
||||
def convert_added_tokens(obj: Union[AddedToken, Any]):
    """
    Recursively replace ``AddedToken`` instances by a tagged copy of their state.

    - An ``AddedToken`` is replaced by the object returned from its ``__getstate__()``, with an extra
      ``"__type": "AddedToken"`` entry added to it.
    - A list or tuple is converted element by element and returned as a list.
    - A dict is converted value by value, keeping its keys.
    - Every other object is returned unchanged.
    """
    if isinstance(obj, AddedToken):
        state = obj.__getstate__()
        # Tag the state so it can be recognized as an AddedToken later on.
        state["__type"] = "AddedToken"
        return state
    if isinstance(obj, (list, tuple)):
        return [convert_added_tokens(item) for item in obj]
    if isinstance(obj, dict):
        return {key: convert_added_tokens(val) for key, val in obj.items()}
    return obj
|
||||
|
||||
tokenizer_config = convert_added_tokens(tokenizer_config)
|
||||
with open(tokenizer_config_file, "w", encoding="utf-8") as f:
|
||||
f.write(json.dumps(tokenizer_config, ensure_ascii=False))
|
||||
|
||||
# Sanitize AddedTokens in special_tokens_map
|
||||
write_dict = convert_added_tokens(self.special_tokens_map_extended)
|
||||
with open(special_tokens_map_file, "w", encoding="utf-8") as f:
|
||||
write_dict = {}
|
||||
for key, value in self.special_tokens_map_extended.items():
|
||||
if isinstance(value, AddedToken):
|
||||
write_dict[key] = value.__getstate__()
|
||||
elif isinstance(value, list):
|
||||
write_dict[key] = [
|
||||
token.__getstate__() if isinstance(token, AddedToken) else token for token in value
|
||||
]
|
||||
else:
|
||||
write_dict[key] = value
|
||||
f.write(json.dumps(write_dict, ensure_ascii=False))
|
||||
|
||||
file_names = (tokenizer_config_file, special_tokens_map_file)
|
||||
|
||||
return self._save_pretrained(save_directory, file_names)
|
||||
|
||||
def _save_pretrained(self, save_directory: str, file_names: Tuple[str]) -> Tuple[str]:
|
||||
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
|
||||
added_vocab = self.get_added_vocab()
|
||||
if added_vocab:
|
||||
with open(added_tokens_file, "w", encoding="utf-8") as f:
|
||||
@@ -1681,7 +1740,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
vocab_files = self.save_vocabulary(save_directory)
|
||||
|
||||
return vocab_files + (special_tokens_map_file, added_tokens_file)
|
||||
return file_names + (vocab_files, added_tokens_file)
|
||||
|
||||
@add_end_docstrings(
|
||||
ENCODE_KWARGS_DOCSTRING,
|
||||
@@ -1752,13 +1811,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
# If you only set max_length, it activates truncation for max_length
|
||||
if max_length is not None and padding is False and truncation is False:
|
||||
if verbose:
|
||||
logger.warning(
|
||||
"Truncation was not explicitely activated but `max_length` is provided a specific value, "
|
||||
"please use `truncation=True` to explicitely truncate examples to max length. "
|
||||
"Defaulting to 'longest_first' truncation strategy. "
|
||||
"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
|
||||
"more precisely by providing a specific strategy to `truncation`."
|
||||
)
|
||||
if not self.deprecation_warnings.get("Truncation-not-explicitely-activated", False):
|
||||
logger.warning(
|
||||
"Truncation was not explicitely activated but `max_length` is provided a specific value, "
|
||||
"please use `truncation=True` to explicitely truncate examples to max length. "
|
||||
"Defaulting to 'longest_first' truncation strategy. "
|
||||
"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
|
||||
"more precisely by providing a specific strategy to `truncation`."
|
||||
)
|
||||
self.deprecation_warnings["Truncation-not-explicitely-activated"] = True
|
||||
truncation = "longest_first"
|
||||
|
||||
# Get padding strategy
|
||||
@@ -1818,10 +1879,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
if padding_strategy == PaddingStrategy.MAX_LENGTH:
|
||||
if self.model_max_length > LARGE_INTEGER:
|
||||
if verbose:
|
||||
logger.warning(
|
||||
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
|
||||
"Default to no padding."
|
||||
)
|
||||
if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
|
||||
logger.warning(
|
||||
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
|
||||
"Default to no padding."
|
||||
)
|
||||
self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
|
||||
padding_strategy = PaddingStrategy.DO_NOT_PAD
|
||||
else:
|
||||
max_length = self.model_max_length
|
||||
@@ -1829,10 +1892,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
|
||||
if self.model_max_length > LARGE_INTEGER:
|
||||
if verbose:
|
||||
logger.warning(
|
||||
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
|
||||
"Default to no truncation."
|
||||
)
|
||||
if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
|
||||
logger.warning(
|
||||
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
|
||||
"Default to no truncation."
|
||||
)
|
||||
self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
|
||||
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
|
||||
else:
|
||||
max_length = self.model_max_length
|
||||
@@ -2437,6 +2502,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
len_ids = len(ids)
|
||||
len_pair_ids = len(pair_ids) if pair else 0
|
||||
|
||||
if return_token_type_ids is not None and not add_special_tokens:
|
||||
raise ValueError(
|
||||
"Asking to return token_type_ids while setting add_special_tokens to False "
|
||||
"results in an undefined behavior. Please set add_special_tokens to True or "
|
||||
"set return_token_type_ids to None."
|
||||
)
|
||||
|
||||
# Load from model defaults
|
||||
if return_token_type_ids is None:
|
||||
return_token_type_ids = "token_type_ids" in self.model_input_names
|
||||
@@ -2469,7 +2541,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
|
||||
else:
|
||||
sequence = ids + pair_ids if pair else ids
|
||||
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
|
||||
token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
|
||||
|
||||
# Build output dictionary
|
||||
encoded_inputs["input_ids"] = sequence
|
||||
@@ -2483,11 +2555,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
# Check lengths
|
||||
if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose:
|
||||
logger.warning(
|
||||
"Token indices sequence length is longer than the specified maximum sequence length "
|
||||
"for this model ({} > {}). Running this sequence through the model will result in "
|
||||
"indexing errors".format(len(encoded_inputs["input_ids"]), self.model_max_length)
|
||||
)
|
||||
if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
|
||||
logger.warning(
|
||||
"Token indices sequence length is longer than the specified maximum sequence length "
|
||||
"for this model ({} > {}). Running this sequence through the model will result in "
|
||||
"indexing errors".format(len(encoded_inputs["input_ids"]), self.model_max_length)
|
||||
)
|
||||
self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True
|
||||
|
||||
# Padding
|
||||
if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
|
||||
@@ -2703,7 +2777,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
]
|
||||
|
||||
def decode(
|
||||
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
|
||||
self,
|
||||
token_ids: List[int],
|
||||
skip_special_tokens: bool = False,
|
||||
clean_up_tokenization_spaces: bool = True,
|
||||
**kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Converts a sequence of ids in a string, using the tokenizer and vocabulary
|
||||
|
||||
Reference in New Issue
Block a user