From 2550b41aa2ec34f05ddfd3ec5875ddb32ad78d58 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 15 Apr 2021 09:32:32 -0400 Subject: [PATCH] Tokenizer fast save (#11234) * Save fast tokenizers in both formats * Fix for HerBERT * Proper fix * Properly test new behavior --- .../models/herbert/tokenization_herbert.py | 4 +- src/transformers/tokenization_utils_base.py | 49 ++++++++++++------- src/transformers/tokenization_utils_fast.py | 16 +++--- tests/test_tokenization_common.py | 43 +++++++++++++++- 4 files changed, 83 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/herbert/tokenization_herbert.py b/src/transformers/models/herbert/tokenization_herbert.py index 0c9c90c818..5a8a1bba57 100644 --- a/src/transformers/models/herbert/tokenization_herbert.py +++ b/src/transformers/models/herbert/tokenization_herbert.py @@ -58,7 +58,7 @@ class HerbertTokenizer(XLMTokenizer): pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, **kwargs): + def __init__(self, *args, **kwargs): kwargs["cls_token"] = "" kwargs["unk_token"] = "" @@ -68,7 +68,7 @@ class HerbertTokenizer(XLMTokenizer): kwargs["do_lowercase_and_remove_accent"] = False kwargs["additional_special_tokens"] = [] - super().__init__(**kwargs) + super().__init__(*args, **kwargs) self.bert_pre_tokenizer = BasicTokenizer( do_lower_case=False, never_split=self.all_special_tokens, tokenize_chinese_chars=False, strip_accents=False ) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index af7b27e303..a839f9012a 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1818,10 +1818,22 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1])) for token, index in 
added_tok_encoder_sorted: - assert index == len(tokenizer), ( - f"Non-consecutive added token '{token}' found. " - f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary." - ) + if has_tokenizer_file and index != len(tokenizer) and tokenizer.convert_tokens_to_ids(token) != index: + # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the + # index is the current length of the tokenizer (not in vocabulary) + raise ValueError( + f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found " + f"{index}." + ) + elif not has_tokenizer_file and index != len(tokenizer): + # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the + # current length of the tokenizer. + raise ValueError( + f"Non-consecutive added token '{token}' found. " + f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary." + ) + + # Safe to call on a tokenizer fast even if token already there. tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens)) # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab @@ -1836,7 +1848,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): def save_pretrained( self, save_directory: Union[str, os.PathLike], - legacy_format: bool = True, + legacy_format: Optional[bool] = None, filename_prefix: Optional[str] = None, ) -> Tuple[str]: """ @@ -1844,13 +1856,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): This method make sure the full tokenizer can then be re-loaded using the - :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method. - - .. Note:: - A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with this method will - not be possible to load back in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer` - instance. 
It can only be loaded in a "fast" tokenizer, i.e. in a - :class:`transformers.PreTrainedTokenizerFast` instance. + :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method. .. Warning:: This won't save modifications you may have applied to the tokenizer after the instantiation (for instance, @@ -1858,11 +1864,16 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): Args: save_directory (:obj:`str` or :obj:`os.PathLike`): The path to a directory where the tokenizer will be saved. - legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and a - separate added_tokens files or in the unified JSON file format for the `tokenizers` library. It's only - possible to save a Fast tokenizer in the unified JSON format and this format is incompatible with - "slow" tokenizers (not powered by the `tokenizers` library). + legacy_format (:obj:`bool`, `optional`): + Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON + format as well as in legacy format, i.e. with tokenizer specific vocabulary and a separate added_tokens + files. + + If :obj:`False`, will only save the tokenizer in the unified JSON format. This format is incompatible + with "slow" tokenizers (not powered by the `tokenizers` library), so the tokenizer will not be able to + be loaded in the corresponding "slow" tokenizer. + + If :obj:`True`, will save the tokenizer in legacy format. filename_prefix: (:obj:`str`, `optional`): A prefix to add to the names of the files saved by the tokenizer. 
@@ -1925,7 +1936,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): self, save_directory: Union[str, os.PathLike], file_names: Tuple[str], - legacy_format: bool = True, + legacy_format: Optional[bool] = None, filename_prefix: Optional[str] = None, ) -> Tuple[str]: """ @@ -1934,7 +1945,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained` """ - if not legacy_format: + if legacy_format is False: raise ValueError( "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format." ) diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 706ee7e22c..df4dec0758 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -516,18 +516,19 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): self, save_directory: Union[str, os.PathLike], file_names: Tuple[str], - legacy_format: bool = True, + legacy_format: Optional[bool] = None, filename_prefix: Optional[str] = None, ) -> Tuple[str]: """ - Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens. - - Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the - specific :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` + Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON + file containing {config + vocab + added-tokens}. 
""" save_directory = str(save_directory) - if legacy_format: + save_slow = legacy_format is None or legacy_format is True + save_fast = legacy_format is None or legacy_format is False + + if save_slow: added_tokens_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE ) @@ -539,7 +540,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix) file_names = file_names + vocab_files + (added_tokens_file,) - else: + + if save_fast: tokenizer_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE ) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index f1f7afca62..aa83b749d4 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -2729,7 +2729,10 @@ class TokenizerTesterMixin: tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2) tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) - # Checks it save with the same files + + # Checks it save with the same files + the tokenizer.json file for the fast one + self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) + tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f) self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) # Checks everything loads correctly in the same way @@ -2744,6 +2747,44 @@ class TokenizerTesterMixin: shutil.rmtree(tmpdirname2) + # Save tokenizer rust, legacy_format=True + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it save with the same files + self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp 
= tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + # Save tokenizer rust, legacy_format=False + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it saved the tokenizer.json file + self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + def test_embeded_special_tokens(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):