Use new huggingface_hub tools for downloading models (#18438)
* Draft new cached_file
* Initial draft for config and model
* Small fixes
* Fix first batch of tests
* Look in cache when internet is down
* Fix last tests
* Bad black, not fixing all quality errors
* Make diff less
* Implement change for TF and Flax models
* Add tokenizer and feature extractor
* For compatibility with main
* Add utils to move the cache and auto-do it at first use.
* Quality
* Deal with empty commit shas
* Deal with empty etag
* Address review comments
This commit is contained in:
@@ -35,21 +35,16 @@ from packaging import version
|
||||
from . import __version__
|
||||
from .dynamic_module_utils import custom_object_save
|
||||
from .utils import (
|
||||
EntryNotFoundError,
|
||||
ExplicitEnum,
|
||||
PaddingStrategy,
|
||||
PushToHubMixin,
|
||||
RepositoryNotFoundError,
|
||||
RevisionNotFoundError,
|
||||
TensorType,
|
||||
add_end_docstrings,
|
||||
cached_path,
|
||||
cached_file,
|
||||
copy_func,
|
||||
get_file_from_repo,
|
||||
hf_bucket_url,
|
||||
is_flax_available,
|
||||
is_offline_mode,
|
||||
is_remote_url,
|
||||
is_tf_available,
|
||||
is_tokenizers_available,
|
||||
is_torch_available,
|
||||
@@ -1669,7 +1664,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
vocab_files = {}
|
||||
init_configuration = {}
|
||||
|
||||
if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
|
||||
is_local = os.path.isdir(pretrained_model_name_or_path)
|
||||
if os.path.isfile(pretrained_model_name_or_path):
|
||||
if len(cls.vocab_files_names) > 1:
|
||||
raise ValueError(
|
||||
f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not "
|
||||
@@ -1689,9 +1685,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
|
||||
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
|
||||
}
|
||||
vocab_files_target = {**cls.vocab_files_names, **additional_files_names}
|
||||
vocab_files = {**cls.vocab_files_names, **additional_files_names}
|
||||
|
||||
if "tokenizer_file" in vocab_files_target:
|
||||
if "tokenizer_file" in vocab_files:
|
||||
# Try to get the tokenizer config to see if there are versioned tokenizer files.
|
||||
fast_tokenizer_file = FULL_TOKENIZER_FILE
|
||||
resolved_config_file = get_file_from_repo(
|
||||
@@ -1704,80 +1700,38 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
use_auth_token=use_auth_token,
|
||||
revision=revision,
|
||||
local_files_only=local_files_only,
|
||||
subfolder=subfolder,
|
||||
)
|
||||
if resolved_config_file is not None:
|
||||
with open(resolved_config_file, encoding="utf-8") as reader:
|
||||
tokenizer_config = json.load(reader)
|
||||
if "fast_tokenizer_files" in tokenizer_config:
|
||||
fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
|
||||
vocab_files_target["tokenizer_file"] = fast_tokenizer_file
|
||||
|
||||
# Look for the tokenizer files
|
||||
for file_id, file_name in vocab_files_target.items():
|
||||
if os.path.isdir(pretrained_model_name_or_path):
|
||||
if subfolder is not None:
|
||||
full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name)
|
||||
else:
|
||||
full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
|
||||
if not os.path.exists(full_file_name):
|
||||
logger.info(f"Didn't find file {full_file_name}. We won't load it.")
|
||||
full_file_name = None
|
||||
else:
|
||||
full_file_name = hf_bucket_url(
|
||||
pretrained_model_name_or_path,
|
||||
filename=file_name,
|
||||
subfolder=subfolder,
|
||||
revision=revision,
|
||||
mirror=None,
|
||||
)
|
||||
|
||||
vocab_files[file_id] = full_file_name
|
||||
vocab_files["tokenizer_file"] = fast_tokenizer_file
|
||||
|
||||
# Get files from url, cache, or disk depending on the case
|
||||
resolved_vocab_files = {}
|
||||
unresolved_files = []
|
||||
for file_id, file_path in vocab_files.items():
|
||||
print(file_id, file_path)
|
||||
if file_path is None:
|
||||
resolved_vocab_files[file_id] = None
|
||||
else:
|
||||
try:
|
||||
resolved_vocab_files[file_id] = cached_path(
|
||||
file_path,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
resume_download=resume_download,
|
||||
local_files_only=local_files_only,
|
||||
use_auth_token=use_auth_token,
|
||||
user_agent=user_agent,
|
||||
)
|
||||
|
||||
except FileNotFoundError as error:
|
||||
if local_files_only:
|
||||
unresolved_files.append(file_id)
|
||||
else:
|
||||
raise error
|
||||
|
||||
except RepositoryNotFoundError:
|
||||
raise EnvironmentError(
|
||||
f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
|
||||
"listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to "
|
||||
"pass a token having permission to this repo with `use_auth_token` or log in with "
|
||||
"`huggingface-cli login` and pass `use_auth_token=True`."
|
||||
)
|
||||
except RevisionNotFoundError:
|
||||
raise EnvironmentError(
|
||||
f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists "
|
||||
"for this model name. Check the model page at "
|
||||
f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
|
||||
)
|
||||
except EntryNotFoundError:
|
||||
logger.debug(f"{pretrained_model_name_or_path} does not contain a file named {file_path}.")
|
||||
resolved_vocab_files[file_id] = None
|
||||
|
||||
except ValueError:
|
||||
logger.debug(f"Connection problem to access {file_path} and it wasn't found in the cache.")
|
||||
resolved_vocab_files[file_id] = None
|
||||
resolved_vocab_files[file_id] = cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
file_path,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
resume_download=resume_download,
|
||||
local_files_only=local_files_only,
|
||||
use_auth_token=use_auth_token,
|
||||
user_agent=user_agent,
|
||||
revision=revision,
|
||||
subfolder=subfolder,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
_raise_exceptions_for_connection_errors=False,
|
||||
)
|
||||
|
||||
if len(unresolved_files) > 0:
|
||||
logger.info(
|
||||
@@ -1797,7 +1751,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
if file_id not in resolved_vocab_files:
|
||||
continue
|
||||
|
||||
if file_path == resolved_vocab_files[file_id]:
|
||||
if is_local:
|
||||
logger.info(f"loading file {file_path}")
|
||||
else:
|
||||
logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
|
||||
|
||||
Reference in New Issue
Block a user