Tokenizer from_pretrained should not use local files named like tokenizer files (#19626)

This commit is contained in:
Sylvain Gugger
2022-10-14 14:06:56 -04:00
committed by GitHub
parent 8fcf562603
commit 3e4900208a
2 changed files with 23 additions and 4 deletions

View File

@@ -1670,6 +1670,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
init_configuration = {}
is_local = os.path.isdir(pretrained_model_name_or_path)
single_file_id = None
if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
if len(cls.vocab_files_names) > 1:
raise ValueError(
@@ -1684,6 +1685,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
file_id = list(cls.vocab_files_names.keys())[0]
vocab_files[file_id] = pretrained_model_name_or_path
single_file_id = file_id
else:
# At this point pretrained_model_name_or_path is either a directory or a model identifier name
additional_files_names = {
@@ -1726,10 +1728,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
for file_id, file_path in vocab_files.items():
if file_path is None:
resolved_vocab_files[file_id] = None
elif os.path.isfile(file_path):
resolved_vocab_files[file_id] = file_path
elif is_remote_url(file_path):
resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies)
elif single_file_id == file_id:
if os.path.isfile(file_path):
resolved_vocab_files[file_id] = file_path
elif is_remote_url(file_path):
resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies)
else:
resolved_vocab_files[file_id] = cached_file(
pretrained_model_name_or_path,