Model versioning (#8324)
* fix typo * rm use_cdn & references, and implement new hf_bucket_url * I'm pretty sure we don't need to `read` this file * same here * [BIG] file_utils.networking: do not gobble up errors anymore * Fix CI 😇 * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Tiny doc tweak * Add doc + pass kwarg everywhere * Add more tests and explain cc @sshleifer let me know if better Co-Authored-By: Sam Shleifer <sshleifer@gmail.com> * Also implement revision in pipelines In the case where we're passing a task name or a string model identifier * Fix CI 😇 * Fix CI * [hf_api] new methods + command line implem * make style * Final endpoints post-migration * Fix post-migration * Py3.6 compat cc @stefan-it Thank you @stas00 Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
This commit is contained in:
@@ -29,6 +29,8 @@ from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
import requests
|
||||
|
||||
from .file_utils import (
|
||||
add_end_docstrings,
|
||||
cached_path,
|
||||
@@ -1515,6 +1517,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
proxies (:obj:`Dict[str, str], `optional`):
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
|
||||
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
|
||||
revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
|
||||
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
|
||||
git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
|
||||
identifier allowed by git.
|
||||
inputs (additional positional arguments, `optional`):
|
||||
Will be passed along to the Tokenizer ``__init__`` method.
|
||||
kwargs (additional keyword arguments, `optional`):
|
||||
@@ -1549,6 +1555,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
resume_download = kwargs.pop("resume_download", False)
|
||||
proxies = kwargs.pop("proxies", None)
|
||||
local_files_only = kwargs.pop("local_files_only", False)
|
||||
revision = kwargs.pop("revision", None)
|
||||
|
||||
s3_models = list(cls.max_model_input_sizes.keys())
|
||||
vocab_files = {}
|
||||
@@ -1601,18 +1608,18 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
full_file_name = None
|
||||
else:
|
||||
full_file_name = hf_bucket_url(
|
||||
pretrained_model_name_or_path, filename=file_name, use_cdn=False, mirror=None
|
||||
pretrained_model_name_or_path, filename=file_name, revision=revision, mirror=None
|
||||
)
|
||||
|
||||
vocab_files[file_id] = full_file_name
|
||||
|
||||
# Get files from url, cache, or disk depending on the case
|
||||
try:
|
||||
resolved_vocab_files = {}
|
||||
for file_id, file_path in vocab_files.items():
|
||||
if file_path is None:
|
||||
resolved_vocab_files[file_id] = None
|
||||
else:
|
||||
resolved_vocab_files = {}
|
||||
for file_id, file_path in vocab_files.items():
|
||||
if file_path is None:
|
||||
resolved_vocab_files[file_id] = None
|
||||
else:
|
||||
try:
|
||||
resolved_vocab_files[file_id] = cached_path(
|
||||
file_path,
|
||||
cache_dir=cache_dir,
|
||||
@@ -1621,34 +1628,20 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
resume_download=resume_download,
|
||||
local_files_only=local_files_only,
|
||||
)
|
||||
except EnvironmentError:
|
||||
if pretrained_model_name_or_path in s3_models:
|
||||
msg = "Couldn't reach server at '{}' to download vocabulary files."
|
||||
else:
|
||||
msg = (
|
||||
"Model name '{}' was not found in tokenizers model name list ({}). "
|
||||
"We assumed '{}' was a path or url to a directory containing vocabulary files "
|
||||
"named {}, but couldn't find such vocabulary files at this path or url.".format(
|
||||
pretrained_model_name_or_path,
|
||||
", ".join(s3_models),
|
||||
pretrained_model_name_or_path,
|
||||
list(cls.vocab_files_names.values()),
|
||||
)
|
||||
)
|
||||
|
||||
raise EnvironmentError(msg)
|
||||
except requests.exceptions.HTTPError as err:
|
||||
if "404 Client Error" in str(err):
|
||||
logger.debug(err)
|
||||
resolved_vocab_files[file_id] = None
|
||||
else:
|
||||
raise err
|
||||
|
||||
if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
|
||||
raise EnvironmentError(
|
||||
"Model name '{}' was not found in tokenizers model name list ({}). "
|
||||
"We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files "
|
||||
"named {} but couldn't find such vocabulary files at this path or url.".format(
|
||||
pretrained_model_name_or_path,
|
||||
", ".join(s3_models),
|
||||
pretrained_model_name_or_path,
|
||||
list(cls.vocab_files_names.values()),
|
||||
)
|
||||
msg = (
|
||||
f"Can't load tokenizer for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
|
||||
f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
|
||||
f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing relevant tokenizer files\n\n"
|
||||
)
|
||||
raise EnvironmentError(msg)
|
||||
|
||||
for file_id, file_path in vocab_files.items():
|
||||
if file_path == resolved_vocab_files[file_id]:
|
||||
|
||||
Reference in New Issue
Block a user