Tokenizer from_pretrained should not use local files named like tokenizer files (#19626)

This commit is contained in:
Sylvain Gugger
2022-10-14 14:06:56 -04:00
committed by GitHub
parent 8fcf562603
commit 3e4900208a
2 changed files with 23 additions and 4 deletions

View File

@@ -3920,6 +3920,22 @@ class TokenizerUtilTester(unittest.TestCase):
finally:
os.remove(tmp_file)
# Supporting this legacy load introduced a weird bug where the tokenizer would load local files if they are in
# the current folder and have the right name.
if os.path.isfile("tokenizer.json"):
# We skip the test if the user has a `tokenizer.json` in this folder to avoid deleting it.
return
try:
with open("tokenizer.json", "wb") as f:
http_get("https://huggingface.co/hf-internal-testing/tiny-random-bert/blob/main/tokenizer.json", f)
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
# The tiny random BERT has a vocab size of 1024, tiny gpt2 as a vocab size of 1000
self.assertEqual(tokenizer.vocab_size, 1000)
# Tokenizer should depend on the remote checkpoint, not the local tokenizer.json file.
finally:
os.remove("tokenizer.json")
def test_legacy_load_from_url(self):
# This test is for deprecated behavior and can be removed in v5
_ = AlbertTokenizer.from_pretrained("https://huggingface.co/albert-base-v1/resolve/main/spiece.model")