From 3e4900208a21eefaf83febd8e4022154eb0e3174 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Fri, 14 Oct 2022 14:06:56 -0400 Subject: [PATCH] Tokenizer from_pretrained should not use local files named like tokenizer files (#19626) --- src/transformers/tokenization_utils_base.py | 11 +++++++---- tests/test_tokenization_common.py | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 54d562136d..b37ed59ef3 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1670,6 +1670,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): init_configuration = {} is_local = os.path.isdir(pretrained_model_name_or_path) + single_file_id = None if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): if len(cls.vocab_files_names) > 1: raise ValueError( @@ -1684,6 +1685,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): file_id = list(cls.vocab_files_names.keys())[0] vocab_files[file_id] = pretrained_model_name_or_path + single_file_id = file_id else: # At this point pretrained_model_name_or_path is either a directory or a model identifier name additional_files_names = { @@ -1726,10 +1728,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): for file_id, file_path in vocab_files.items(): if file_path is None: resolved_vocab_files[file_id] = None - elif os.path.isfile(file_path): - resolved_vocab_files[file_id] = file_path - elif is_remote_url(file_path): - resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies) + elif single_file_id == file_id: + if os.path.isfile(file_path): + resolved_vocab_files[file_id] = file_path + elif is_remote_url(file_path): + resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies) else: 
resolved_vocab_files[file_id] = cached_file( pretrained_model_name_or_path, diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 537f5fb9bd..9f14e4122b 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -3920,6 +3920,22 @@ class TokenizerUtilTester(unittest.TestCase): finally: os.remove(tmp_file) + # Supporting this legacy load introduced a weird bug where the tokenizer would load local files if they are in + # the current folder and have the right name. + if os.path.isfile("tokenizer.json"): + # We skip the test if the user has a `tokenizer.json` in this folder to avoid deleting it. + return + try: + with open("tokenizer.json", "wb") as f: + http_get("https://huggingface.co/hf-internal-testing/tiny-random-bert/blob/main/tokenizer.json", f) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + # The tiny random BERT has a vocab size of 1024, tiny gpt2 has a vocab size of 1000 + self.assertEqual(tokenizer.vocab_size, 1000) + # Tokenizer should depend on the remote checkpoint, not the local tokenizer.json file. + + finally: + os.remove("tokenizer.json") + def test_legacy_load_from_url(self): # This test is for deprecated behavior and can be removed in v5 _ = AlbertTokenizer.from_pretrained("https://huggingface.co/albert-base-v1/resolve/main/spiece.model")