From 7b8bdd8601f1c9cc91434c858b848a34126a0a83 Mon Sep 17 00:00:00 2001
From: SaulLu <55560583+SaulLu@users.noreply.github.com>
Date: Tue, 1 Feb 2022 16:48:25 +0100
Subject: [PATCH] fix the `tokenizer_config.json` file for the slow tokenizer
 when a fast version is available (#15319)

* add new test

* update test

* remove `tokenizer_file` from `additional_files_names` in `tokenization_utils_base.py`

* add `tokenizer_file` for the fast only tokenizer

* change global variables layoutxml

* remove `"tokenizer_file"` from DPR tokenizer's Global variables

* remove `tokenizer_file` from herbert slow tokenizer init

* `"tokenizer_file"` from LED tokenizer's Global variables

* remove `tokenizer_file` from mbart slow tokenizer init

* remove `tokenizer_file` from slow tokenizer template

* adapt to versioning

* adapt the `test_tokenizer_mismatch_warning` test

* clean test

* clarify `VOCAB_FILES_NAMES` in tokenization_utils_fast.py

* Revert "remove `tokenizer_file` from mbart slow tokenizer init"

This reverts commit 0dbb723fa9c7599d4640fe30b3647a74eb4a64e1.

* Revert "`"tokenizer_file"` from LED tokenizer's Global variables"

This reverts commit 5a3f879bdd651233f3d74a3d1146c34cde82b0c2.

* Revert "remove `tokenizer_file` from herbert slow tokenizer init"

This reverts commit f5e10007b7b0ec5345e015b9de7ffec72c5407fd.

* Revert "remove `"tokenizer_file"` from DPR tokenizer's Global variables"

This reverts commit da0895330bedfafc81ae3073470a9348c669f032.

* set `tokenizer_file` in super `__init__` of mbart
---
 .../layoutxlm/tokenization_layoutxlm_fast.py  |  2 +-
 .../models/mbart/tokenization_mbart.py        |  2 +-
 src/transformers/tokenization_utils_base.py   | 48 ++++++++++---------
 src/transformers/tokenization_utils_fast.py   |  4 +-
 ...on_{{cookiecutter.lowercase_modelname}}.py |  5 +-
 tests/test_tokenization_common.py             | 44 +++++++++++++++--
 6 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
index 47dd362505..ca06657958 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
@@ -34,7 +34,7 @@ from ...tokenization_utils_base import (
 )
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import logging
-from ..xlm_roberta.tokenization_xlm_roberta import (
+from ..xlm_roberta.tokenization_xlm_roberta_fast import (
     PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
     PRETRAINED_VOCAB_FILES_MAP,
     VOCAB_FILES_NAMES,
diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py
index e6d3ff4337..d6ea6260ae 100644
--- a/src/transformers/models/mbart/tokenization_mbart.py
+++ b/src/transformers/models/mbart/tokenization_mbart.py
@@ -110,7 +110,7 @@ class MBartTokenizer(PreTrainedTokenizer):
             cls_token=cls_token,
             pad_token=pad_token,
             mask_token=mask_token,
-            tokenizer_file=tokenizer_file,
+            tokenizer_file=None,
             src_lang=src_lang,
             tgt_lang=tgt_lang,
             additional_special_tokens=additional_special_tokens,
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index ebd83e4214..ed1fc2f10e 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1649,34 +1649,36 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             vocab_files[file_id] = pretrained_model_name_or_path
         else:
             # At this point pretrained_model_name_or_path is either a directory or a model identifier name
-
-            # Try to get the tokenizer config to see if there are versioned tokenizer files.
-            fast_tokenizer_file = FULL_TOKENIZER_FILE
-            resolved_config_file = get_file_from_repo(
-                pretrained_model_name_or_path,
-                TOKENIZER_CONFIG_FILE,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                resume_download=resume_download,
-                proxies=proxies,
-                use_auth_token=use_auth_token,
-                revision=revision,
-                local_files_only=local_files_only,
-            )
-            if resolved_config_file is not None:
-                with open(resolved_config_file, encoding="utf-8") as reader:
-                    tokenizer_config = json.load(reader)
-                    if "fast_tokenizer_files" in tokenizer_config:
-                        fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
-
             additional_files_names = {
                 "added_tokens_file": ADDED_TOKENS_FILE,
                 "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
                 "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
-                "tokenizer_file": fast_tokenizer_file,
             }
+            vocab_files_target = {**cls.vocab_files_names, **additional_files_names}
+
+            if "tokenizer_file" in vocab_files_target:
+                # Try to get the tokenizer config to see if there are versioned tokenizer files.
+                fast_tokenizer_file = FULL_TOKENIZER_FILE
+                resolved_config_file = get_file_from_repo(
+                    pretrained_model_name_or_path,
+                    TOKENIZER_CONFIG_FILE,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    resume_download=resume_download,
+                    proxies=proxies,
+                    use_auth_token=use_auth_token,
+                    revision=revision,
+                    local_files_only=local_files_only,
+                )
+                if resolved_config_file is not None:
+                    with open(resolved_config_file, encoding="utf-8") as reader:
+                        tokenizer_config = json.load(reader)
+                        if "fast_tokenizer_files" in tokenizer_config:
+                            fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
+                vocab_files_target["tokenizer_file"] = fast_tokenizer_file
+
             # Look for the tokenizer files
-            for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
+            for file_id, file_name in vocab_files_target.items():
                 if os.path.isdir(pretrained_model_name_or_path):
                     if subfolder is not None:
                         full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name)
@@ -1758,7 +1760,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from "
                 "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
                 f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
-                f"containing all relevant tokenizer files."
+                f"containing all relevant files for a {cls.__name__} tokenizer."
             )
 
         for file_id, file_path in vocab_files.items():
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 3985150e5f..d14b02d11f 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -46,7 +46,6 @@ from .utils import logging
 
 logger = logging.get_logger(__name__)
 
-
 # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
 TOKENIZER_FILE = "tokenizer.json"
 SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
@@ -71,6 +70,8 @@ MODEL_TO_TRAINER_MAPPING = {
     "WordPiece": WordPieceTrainer,
 }
 
+VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE}
+
 
 @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
 class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
@@ -86,6 +87,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
     specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
     """
 
+    vocab_files_names = VOCAB_FILES_NAMES
     slow_tokenizer_class: PreTrainedTokenizer = None
     can_save_slow_tokenizer: bool = True
 
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
index 9de18085a3..a3ad1dd7c9 100644
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
@@ -62,7 +62,7 @@ from ..bart.tokenization_bart import BartTokenizer
 
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
@@ -71,9 +71,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
     "merges_file": {
         "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/merges.txt",
     },
-    "tokenizer_file": {
-        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/tokenizer.json",
-    },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 0cfbeb7f53..6e1650f260 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -3604,15 +3604,24 @@ class TokenizerTesterMixin:
                             AlbertTokenizer.from_pretrained(pretrained_name)
                         else:
                             BertTokenizer.from_pretrained(pretrained_name)
+                    except EnvironmentError as e:
+                        # Some tokenizer will raised an error before reaching the logged warning because there are no
+                        # corresponding files to load
+                        error_message = str(e)
                     except (TypeError, AttributeError):
                         # Some tokenizers cannot be loaded into the target tokenizer at all and errors are returned,
                         # here we just check that the warning has been logged before the error is raised
                         pass
                     finally:
+                        logged_msg_target = (
+                            "The tokenizer class you load from this checkpoint is not the same type as the class "
+                            "this function is called from."
+                        )
+                        raised_error_msg_target = "Can't load tokenizer for"
                         self.assertTrue(
-                            cm.records[0].message.startswith(
-                                "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from."
-                            )
+                            cm.records[0].message.startswith(logged_msg_target)
+                            if len(cm.records) > 0
+                            else False or raised_error_msg_target in error_message
                         )
                     try:
                         if self.rust_tokenizer_class == BertTokenizerFast:
@@ -3651,6 +3660,35 @@ class TokenizerTesterMixin:
                     trainer.save_model(os.path.join(tmp_dir, "checkpoint"))
                     self.assertIn("tokenizer.json", os.listdir(os.path.join(tmp_dir, "checkpoint")))
 
+    def test_save_slow_from_fast_and_reload_fast(self):
+        if not self.test_slow_tokenizer or not self.test_rust_tokenizer:
+            # we need both slow and fast versions
+            return
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                with tempfile.TemporaryDirectory() as tmp_dir_1:
+                    # Here we check that even if we have initialized a fast tokenizer with a tokenizer_file we can
+                    # still save only the slow version and use these saved files to rebuild a tokenizer
+                    tokenizer_fast_old_1 = self.rust_tokenizer_class.from_pretrained(
+                        pretrained_name, **kwargs, use_fast=True
+                    )
+                    tokenizer_file = os.path.join(tmp_dir_1, "tokenizer.json")
+                    tokenizer_fast_old_1.backend_tokenizer.save(tokenizer_file)
+
+                    tokenizer_fast_old_2 = self.rust_tokenizer_class.from_pretrained(
+                        pretrained_name, **kwargs, use_fast=True, tokenizer_file=tokenizer_file
+                    )
+
+                    tokenizer_fast_old_2.save_pretrained(tmp_dir_1, legacy_format=True)  # save only slow version
+
+                    tokenizer_slow = self.tokenizer_class.from_pretrained(tmp_dir_1)
+                with tempfile.TemporaryDirectory() as tmp_dir_2:
+                    tokenizer_slow.save_pretrained(tmp_dir_2)
+
+                    # Should not raise an error
+                    self.rust_tokenizer_class.from_pretrained(tmp_dir_2)
+
 
 class FakeTokenizer(BertTokenizer):
     pass