fix the tokenizer_config.json file for the slow tokenizer when a fast version is available (#15319)
* add new test
* update test
* remove `tokenizer_file` from `additional_files_names` in `tokenization_utils_base.py`
* add `tokenizer_file` for the fast only tokenizer
* change global variables layoutxml
* remove `"tokenizer_file"` from DPR tokenizer's Global variables
* remove `tokenizer_file` from herbert slow tokenizer init
* `"tokenizer_file"` from LED tokenizer's Global variables
* remove `tokenizer_file` from mbart slow tokenizer init
* remove `tokenizer_file` from slow tokenizer template
* adapt to versioning
* adapt the `test_tokenizer_mismatch_warning` test
* clean test
* clarify `VOCAB_FILES_NAMES` in tokenization_utils_fast.py
* Revert "remove `tokenizer_file` from mbart slow tokenizer init"
  This reverts commit 0dbb723fa9c7599d4640fe30b3647a74eb4a64e1.
* Revert "`"tokenizer_file"` from LED tokenizer's Global variables"
  This reverts commit 5a3f879bdd651233f3d74a3d1146c34cde82b0c2.
* Revert "remove `tokenizer_file` from herbert slow tokenizer init"
  This reverts commit f5e10007b7b0ec5345e015b9de7ffec72c5407fd.
* Revert "remove `"tokenizer_file"` from DPR tokenizer's Global variables"
  This reverts commit da0895330bedfafc81ae3073470a9348c669f032.
* set `tokenizer_file` in super `__init__` of mbart
This commit is contained in:
@@ -34,7 +34,7 @@ from ...tokenization_utils_base import (
|
|||||||
)
|
)
|
||||||
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..xlm_roberta.tokenization_xlm_roberta import (
|
from ..xlm_roberta.tokenization_xlm_roberta_fast import (
|
||||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
|
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
|
||||||
PRETRAINED_VOCAB_FILES_MAP,
|
PRETRAINED_VOCAB_FILES_MAP,
|
||||||
VOCAB_FILES_NAMES,
|
VOCAB_FILES_NAMES,
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ class MBartTokenizer(PreTrainedTokenizer):
|
|||||||
cls_token=cls_token,
|
cls_token=cls_token,
|
||||||
pad_token=pad_token,
|
pad_token=pad_token,
|
||||||
mask_token=mask_token,
|
mask_token=mask_token,
|
||||||
tokenizer_file=tokenizer_file,
|
tokenizer_file=None,
|
||||||
src_lang=src_lang,
|
src_lang=src_lang,
|
||||||
tgt_lang=tgt_lang,
|
tgt_lang=tgt_lang,
|
||||||
additional_special_tokens=additional_special_tokens,
|
additional_special_tokens=additional_special_tokens,
|
||||||
|
|||||||
@@ -1649,7 +1649,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
vocab_files[file_id] = pretrained_model_name_or_path
|
vocab_files[file_id] = pretrained_model_name_or_path
|
||||||
else:
|
else:
|
||||||
# At this point pretrained_model_name_or_path is either a directory or a model identifier name
|
# At this point pretrained_model_name_or_path is either a directory or a model identifier name
|
||||||
|
additional_files_names = {
|
||||||
|
"added_tokens_file": ADDED_TOKENS_FILE,
|
||||||
|
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
|
||||||
|
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
|
||||||
|
}
|
||||||
|
vocab_files_target = {**cls.vocab_files_names, **additional_files_names}
|
||||||
|
|
||||||
|
if "tokenizer_file" in vocab_files_target:
|
||||||
# Try to get the tokenizer config to see if there are versioned tokenizer files.
|
# Try to get the tokenizer config to see if there are versioned tokenizer files.
|
||||||
fast_tokenizer_file = FULL_TOKENIZER_FILE
|
fast_tokenizer_file = FULL_TOKENIZER_FILE
|
||||||
resolved_config_file = get_file_from_repo(
|
resolved_config_file = get_file_from_repo(
|
||||||
@@ -1668,15 +1675,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
tokenizer_config = json.load(reader)
|
tokenizer_config = json.load(reader)
|
||||||
if "fast_tokenizer_files" in tokenizer_config:
|
if "fast_tokenizer_files" in tokenizer_config:
|
||||||
fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
|
fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
|
||||||
|
vocab_files_target["tokenizer_file"] = fast_tokenizer_file
|
||||||
|
|
||||||
additional_files_names = {
|
|
||||||
"added_tokens_file": ADDED_TOKENS_FILE,
|
|
||||||
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
|
|
||||||
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
|
|
||||||
"tokenizer_file": fast_tokenizer_file,
|
|
||||||
}
|
|
||||||
# Look for the tokenizer files
|
# Look for the tokenizer files
|
||||||
for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
|
for file_id, file_name in vocab_files_target.items():
|
||||||
if os.path.isdir(pretrained_model_name_or_path):
|
if os.path.isdir(pretrained_model_name_or_path):
|
||||||
if subfolder is not None:
|
if subfolder is not None:
|
||||||
full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name)
|
full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name)
|
||||||
@@ -1758,7 +1760,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from "
|
f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from "
|
||||||
"'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
|
"'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
|
||||||
f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
|
f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
|
||||||
f"containing all relevant tokenizer files."
|
f"containing all relevant files for a {cls.__name__} tokenizer."
|
||||||
)
|
)
|
||||||
|
|
||||||
for file_id, file_path in vocab_files.items():
|
for file_id, file_path in vocab_files.items():
|
||||||
|
|||||||
@@ -46,7 +46,6 @@ from .utils import logging
|
|||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
|
# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
|
||||||
TOKENIZER_FILE = "tokenizer.json"
|
TOKENIZER_FILE = "tokenizer.json"
|
||||||
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
|
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
|
||||||
@@ -71,6 +70,8 @@ MODEL_TO_TRAINER_MAPPING = {
|
|||||||
"WordPiece": WordPieceTrainer,
|
"WordPiece": WordPieceTrainer,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE}
|
||||||
|
|
||||||
|
|
||||||
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
|
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
|
||||||
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
||||||
@@ -86,6 +87,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
|||||||
specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
|
specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
slow_tokenizer_class: PreTrainedTokenizer = None
|
slow_tokenizer_class: PreTrainedTokenizer = None
|
||||||
can_save_slow_tokenizer: bool = True
|
can_save_slow_tokenizer: bool = True
|
||||||
|
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ from ..bart.tokenization_bart import BartTokenizer
|
|||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
|
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
|
||||||
|
|
||||||
PRETRAINED_VOCAB_FILES_MAP = {
|
PRETRAINED_VOCAB_FILES_MAP = {
|
||||||
"vocab_file": {
|
"vocab_file": {
|
||||||
@@ -71,9 +71,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
|||||||
"merges_file": {
|
"merges_file": {
|
||||||
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/merges.txt",
|
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/merges.txt",
|
||||||
},
|
},
|
||||||
"tokenizer_file": {
|
|
||||||
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/tokenizer.json",
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||||
|
|||||||
@@ -3604,15 +3604,24 @@ class TokenizerTesterMixin:
|
|||||||
AlbertTokenizer.from_pretrained(pretrained_name)
|
AlbertTokenizer.from_pretrained(pretrained_name)
|
||||||
else:
|
else:
|
||||||
BertTokenizer.from_pretrained(pretrained_name)
|
BertTokenizer.from_pretrained(pretrained_name)
|
||||||
|
except EnvironmentError as e:
|
||||||
|
# Some tokenizers will raise an error before reaching the logged warning because there are no
|
||||||
|
# corresponding files to load
|
||||||
|
error_message = str(e)
|
||||||
except (TypeError, AttributeError):
|
except (TypeError, AttributeError):
|
||||||
# Some tokenizers cannot be loaded into the target tokenizer at all and errors are returned,
|
# Some tokenizers cannot be loaded into the target tokenizer at all and errors are returned,
|
||||||
# here we just check that the warning has been logged before the error is raised
|
# here we just check that the warning has been logged before the error is raised
|
||||||
pass
|
pass
|
||||||
finally:
|
finally:
|
||||||
self.assertTrue(
|
logged_msg_target = (
|
||||||
cm.records[0].message.startswith(
|
"The tokenizer class you load from this checkpoint is not the same type as the class "
|
||||||
"The tokenizer class you load from this checkpoint is not the same type as the class this function is called from."
|
"this function is called from."
|
||||||
)
|
)
|
||||||
|
raised_error_msg_target = "Can't load tokenizer for"
|
||||||
|
self.assertTrue(
|
||||||
|
cm.records[0].message.startswith(logged_msg_target)
|
||||||
|
if len(cm.records) > 0
|
||||||
|
else False or raised_error_msg_target in error_message
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
if self.rust_tokenizer_class == BertTokenizerFast:
|
if self.rust_tokenizer_class == BertTokenizerFast:
|
||||||
@@ -3651,6 +3660,35 @@ class TokenizerTesterMixin:
|
|||||||
trainer.save_model(os.path.join(tmp_dir, "checkpoint"))
|
trainer.save_model(os.path.join(tmp_dir, "checkpoint"))
|
||||||
self.assertIn("tokenizer.json", os.listdir(os.path.join(tmp_dir, "checkpoint")))
|
self.assertIn("tokenizer.json", os.listdir(os.path.join(tmp_dir, "checkpoint")))
|
||||||
|
|
||||||
|
def test_save_slow_from_fast_and_reload_fast(self):
|
||||||
|
if not self.test_slow_tokenizer or not self.test_rust_tokenizer:
|
||||||
|
# we need both slow and fast versions
|
||||||
|
return
|
||||||
|
|
||||||
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
|
with tempfile.TemporaryDirectory() as tmp_dir_1:
|
||||||
|
# Here we check that even if we have initialized a fast tokenizer with a tokenizer_file we can
|
||||||
|
# still save only the slow version and use these saved files to rebuild a tokenizer
|
||||||
|
tokenizer_fast_old_1 = self.rust_tokenizer_class.from_pretrained(
|
||||||
|
pretrained_name, **kwargs, use_fast=True
|
||||||
|
)
|
||||||
|
tokenizer_file = os.path.join(tmp_dir_1, "tokenizer.json")
|
||||||
|
tokenizer_fast_old_1.backend_tokenizer.save(tokenizer_file)
|
||||||
|
|
||||||
|
tokenizer_fast_old_2 = self.rust_tokenizer_class.from_pretrained(
|
||||||
|
pretrained_name, **kwargs, use_fast=True, tokenizer_file=tokenizer_file
|
||||||
|
)
|
||||||
|
|
||||||
|
tokenizer_fast_old_2.save_pretrained(tmp_dir_1, legacy_format=True) # save only slow version
|
||||||
|
|
||||||
|
tokenizer_slow = self.tokenizer_class.from_pretrained(tmp_dir_1)
|
||||||
|
with tempfile.TemporaryDirectory() as tmp_dir_2:
|
||||||
|
tokenizer_slow.save_pretrained(tmp_dir_2)
|
||||||
|
|
||||||
|
# Should not raise an error
|
||||||
|
self.rust_tokenizer_class.from_pretrained(tmp_dir_2)
|
||||||
|
|
||||||
|
|
||||||
class FakeTokenizer(BertTokenizer):
|
class FakeTokenizer(BertTokenizer):
|
||||||
pass
|
pass
|
||||||
|
|||||||
Reference in New Issue
Block a user