From 7b8bdd8601f1c9cc91434c858b848a34126a0a83 Mon Sep 17 00:00:00 2001 From: SaulLu <55560583+SaulLu@users.noreply.github.com> Date: Tue, 1 Feb 2022 16:48:25 +0100 Subject: [PATCH] fix the `tokenizer_config.json` file for the slow tokenizer when a fast version is available (#15319) * add new test * update test * remove `tokenizer_file` from `additional_files_names` in `tokenization_utils_base.py` * add `tokenizer_file` for the fast only tokenizer * change global variables layoutxlm * remove `"tokenizer_file"` from DPR tokenizer's Global variables * remove `tokenizer_file` from herbert slow tokenizer init * `"tokenizer_file"` from LED tokenizer's Global variables * remove `tokenizer_file` from mbart slow tokenizer init * remove `tokenizer_file` from slow tokenizer template * adapt to versioning * adapt the `test_tokenizer_mismatch_warning` test * clean test * clarify `VOCAB_FILES_NAMES` in tokenization_utils_fast.py * Revert "remove `tokenizer_file` from mbart slow tokenizer init" This reverts commit 0dbb723fa9c7599d4640fe30b3647a74eb4a64e1. * Revert "`"tokenizer_file"` from LED tokenizer's Global variables" This reverts commit 5a3f879bdd651233f3d74a3d1146c34cde82b0c2. * Revert "remove `tokenizer_file` from herbert slow tokenizer init" This reverts commit f5e10007b7b0ec5345e015b9de7ffec72c5407fd. * Revert "remove `"tokenizer_file"` from DPR tokenizer's Global variables" This reverts commit da0895330bedfafc81ae3073470a9348c669f032. 
* set `tokenizer_file` in super `__init__` of mbart --- .../layoutxlm/tokenization_layoutxlm_fast.py | 2 +- .../models/mbart/tokenization_mbart.py | 2 +- src/transformers/tokenization_utils_base.py | 48 ++++++++++--------- src/transformers/tokenization_utils_fast.py | 4 +- ...on_{{cookiecutter.lowercase_modelname}}.py | 5 +- tests/test_tokenization_common.py | 44 +++++++++++++++-- 6 files changed, 72 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py index 47dd362505..ca06657958 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py @@ -34,7 +34,7 @@ from ...tokenization_utils_base import ( ) from ...tokenization_utils_fast import PreTrainedTokenizerFast from ...utils import logging -from ..xlm_roberta.tokenization_xlm_roberta import ( +from ..xlm_roberta.tokenization_xlm_roberta_fast import ( PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES, PRETRAINED_VOCAB_FILES_MAP, VOCAB_FILES_NAMES, diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index e6d3ff4337..d6ea6260ae 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -110,7 +110,7 @@ class MBartTokenizer(PreTrainedTokenizer): cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, - tokenizer_file=tokenizer_file, + tokenizer_file=None, src_lang=src_lang, tgt_lang=tgt_lang, additional_special_tokens=additional_special_tokens, diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index ebd83e4214..ed1fc2f10e 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1649,34 +1649,36 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): 
vocab_files[file_id] = pretrained_model_name_or_path else: # At this point pretrained_model_name_or_path is either a directory or a model identifier name - - # Try to get the tokenizer config to see if there are versioned tokenizer files. - fast_tokenizer_file = FULL_TOKENIZER_FILE - resolved_config_file = get_file_from_repo( - pretrained_model_name_or_path, - TOKENIZER_CONFIG_FILE, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - use_auth_token=use_auth_token, - revision=revision, - local_files_only=local_files_only, - ) - if resolved_config_file is not None: - with open(resolved_config_file, encoding="utf-8") as reader: - tokenizer_config = json.load(reader) - if "fast_tokenizer_files" in tokenizer_config: - fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"]) - additional_files_names = { "added_tokens_file": ADDED_TOKENS_FILE, "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, "tokenizer_config_file": TOKENIZER_CONFIG_FILE, - "tokenizer_file": fast_tokenizer_file, } + vocab_files_target = {**cls.vocab_files_names, **additional_files_names} + + if "tokenizer_file" in vocab_files_target: + # Try to get the tokenizer config to see if there are versioned tokenizer files. 
+ fast_tokenizer_file = FULL_TOKENIZER_FILE + resolved_config_file = get_file_from_repo( + pretrained_model_name_or_path, + TOKENIZER_CONFIG_FILE, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + use_auth_token=use_auth_token, + revision=revision, + local_files_only=local_files_only, + ) + if resolved_config_file is not None: + with open(resolved_config_file, encoding="utf-8") as reader: + tokenizer_config = json.load(reader) + if "fast_tokenizer_files" in tokenizer_config: + fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"]) + vocab_files_target["tokenizer_file"] = fast_tokenizer_file + # Look for the tokenizer files - for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): + for file_id, file_name in vocab_files_target.items(): if os.path.isdir(pretrained_model_name_or_path): if subfolder is not None: full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name) @@ -1758,7 +1760,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " - f"containing all relevant tokenizer files." + f"containing all relevant files for a {cls.__name__} tokenizer." 
) for file_id, file_path in vocab_files.items(): diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 3985150e5f..d14b02d11f 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -46,7 +46,6 @@ from .utils import logging logger = logging.get_logger(__name__) - # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file TOKENIZER_FILE = "tokenizer.json" SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" @@ -71,6 +70,8 @@ MODEL_TO_TRAINER_MAPPING = { "WordPiece": WordPieceTrainer, } +VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE} + @add_end_docstrings(INIT_TOKENIZER_DOCSTRING) class PreTrainedTokenizerFast(PreTrainedTokenizerBase): @@ -86,6 +87,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). """ + vocab_files_names = VOCAB_FILES_NAMES slow_tokenizer_class: PreTrainedTokenizer = None can_save_slow_tokenizer: bool = True diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py index 9de18085a3..a3ad1dd7c9 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py @@ -62,7 +62,7 @@ from ..bart.tokenization_bart import BartTokenizer logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} 
PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { @@ -71,9 +71,6 @@ PRETRAINED_VOCAB_FILES_MAP = { "merges_file": { "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/merges.txt", }, - "tokenizer_file": { - "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/tokenizer.json", - }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 0cfbeb7f53..6e1650f260 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -3604,15 +3604,24 @@ class TokenizerTesterMixin: AlbertTokenizer.from_pretrained(pretrained_name) else: BertTokenizer.from_pretrained(pretrained_name) + except EnvironmentError as e: + # Some tokenizer will raised an error before reaching the logged warning because there are no + # corresponding files to load + error_message = str(e) except (TypeError, AttributeError): # Some tokenizers cannot be loaded into the target tokenizer at all and errors are returned, # here we just check that the warning has been logged before the error is raised pass finally: + logged_msg_target = ( + "The tokenizer class you load from this checkpoint is not the same type as the class " + "this function is called from." + ) + raised_error_msg_target = "Can't load tokenizer for" self.assertTrue( - cm.records[0].message.startswith( - "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from." 
- ) + cm.records[0].message.startswith(logged_msg_target) + if len(cm.records) > 0 + else False or raised_error_msg_target in error_message ) try: if self.rust_tokenizer_class == BertTokenizerFast: @@ -3651,6 +3660,35 @@ class TokenizerTesterMixin: trainer.save_model(os.path.join(tmp_dir, "checkpoint")) self.assertIn("tokenizer.json", os.listdir(os.path.join(tmp_dir, "checkpoint"))) + def test_save_slow_from_fast_and_reload_fast(self): + if not self.test_slow_tokenizer or not self.test_rust_tokenizer: + # we need both slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + with tempfile.TemporaryDirectory() as tmp_dir_1: + # Here we check that even if we have initialized a fast tokenizer with a tokenizer_file we can + # still save only the slow version and use these saved files to rebuild a tokenizer + tokenizer_fast_old_1 = self.rust_tokenizer_class.from_pretrained( + pretrained_name, **kwargs, use_fast=True + ) + tokenizer_file = os.path.join(tmp_dir_1, "tokenizer.json") + tokenizer_fast_old_1.backend_tokenizer.save(tokenizer_file) + + tokenizer_fast_old_2 = self.rust_tokenizer_class.from_pretrained( + pretrained_name, **kwargs, use_fast=True, tokenizer_file=tokenizer_file + ) + + tokenizer_fast_old_2.save_pretrained(tmp_dir_1, legacy_format=True) # save only slow version + + tokenizer_slow = self.tokenizer_class.from_pretrained(tmp_dir_1) + with tempfile.TemporaryDirectory() as tmp_dir_2: + tokenizer_slow.save_pretrained(tmp_dir_2) + + # Should not raise an error + self.rust_tokenizer_class.from_pretrained(tmp_dir_2) + class FakeTokenizer(BertTokenizer): pass