Tokenizer fast save (#11234)
* Save fast tokenizers in both formats * Fix for HerBERT * Proper fix * Properly test new behavior
This commit is contained in:
@@ -1818,10 +1818,22 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))
|
||||
|
||||
for token, index in added_tok_encoder_sorted:
|
||||
assert index == len(tokenizer), (
|
||||
f"Non-consecutive added token '{token}' found. "
|
||||
f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
|
||||
)
|
||||
if has_tokenizer_file and index != len(tokenizer) and tokenizer.convert_tokens_to_ids(token) != index:
|
||||
# Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the
|
||||
# index is the current length of the tokenizer (not in vocabulary)
|
||||
raise ValueError(
|
||||
f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found "
|
||||
f"{index}."
|
||||
)
|
||||
elif not has_tokenizer_file and index != len(tokenizer):
|
||||
# Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the
|
||||
# current length of the tokenizer.
|
||||
raise ValueError(
|
||||
f"Non-consecutive added token '{token}' found. "
|
||||
f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
|
||||
)
|
||||
|
||||
# Safe to call on a tokenizer fast even if token already there.
|
||||
tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens))
|
||||
|
||||
# Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
|
||||
@@ -1836,7 +1848,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
def save_pretrained(
|
||||
self,
|
||||
save_directory: Union[str, os.PathLike],
|
||||
legacy_format: bool = True,
|
||||
legacy_format: Optional[bool] = None,
|
||||
filename_prefix: Optional[str] = None,
|
||||
) -> Tuple[str]:
|
||||
"""
|
||||
@@ -1844,13 +1856,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
|
||||
This method make sure the full tokenizer can then be re-loaded using the
|
||||
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.
|
||||
|
||||
.. Note::
|
||||
A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with this method will
|
||||
not be possible to load back in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer`
|
||||
instance. It can only be loaded in a "fast" tokenizer, i.e. in a
|
||||
:class:`transformers.PreTrainedTokenizerFast` instance.
|
||||
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method..
|
||||
|
||||
.. Warning::
|
||||
This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
|
||||
@@ -1858,11 +1864,16 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
Args:
|
||||
save_directory (:obj:`str` or :obj:`os.PathLike`): The path to a directory where the tokenizer will be saved.
|
||||
legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and a
|
||||
separate added_tokens files or in the unified JSON file format for the `tokenizers` library. It's only
|
||||
possible to save a Fast tokenizer in the unified JSON format and this format is incompatible with
|
||||
"slow" tokenizers (not powered by the `tokenizers` library).
|
||||
legacy_format (:obj:`bool`, `optional`):
|
||||
Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
|
||||
format as well as in legacy format, i.e. with tokenizer specific vocabulary and a separate added_tokens
|
||||
files.
|
||||
|
||||
If :obj:`False`, will only save the tokenizer in the unified JSON format. This format is incompatible
|
||||
with "slow" tokenizers (not powered by the `tokenizers` library), so the tokenizer will not be able to
|
||||
be loaded in the corresponding "slow" tokenizer.
|
||||
|
||||
If :obj:`True`, will save the tokenizer in legacy format.
|
||||
filename_prefix: (:obj:`str`, `optional`):
|
||||
A prefix to add to the names of the files saved by the tokenizer.
|
||||
|
||||
@@ -1925,7 +1936,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
self,
|
||||
save_directory: Union[str, os.PathLike],
|
||||
file_names: Tuple[str],
|
||||
legacy_format: bool = True,
|
||||
legacy_format: Optional[bool] = None,
|
||||
filename_prefix: Optional[str] = None,
|
||||
) -> Tuple[str]:
|
||||
"""
|
||||
@@ -1934,7 +1945,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
|
||||
specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
|
||||
"""
|
||||
if not legacy_format:
|
||||
if legacy_format is False:
|
||||
raise ValueError(
|
||||
"Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user