Model versioning (#8324)

* fix typo * rm use_cdn & references, and implement new hf_bucket_url * I'm pretty sure we don't need to `read` this file * same here * [BIG] file_utils.networking: do not gobble up errors anymore * Fix CI 😇 * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Tiny doc tweak * Add doc + pass kwarg everywhere * Add more tests and explain cc @sshleifer let me know if better Co-Authored-By: Sam Shleifer <sshleifer@gmail.com> * Also implement revision in pipelines In the case where we're passing a task name or a string model identifier * Fix CI 😇 * Fix CI * [hf_api] new methods + command line implem * make style * Final endpoints post-migration * Fix post-migration * Py3.6 compat cc @stefan-it Thank you @stas00 Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-11-10 13:11:02 +01:00
parent 4185b115d4
commit 70f622fab4
23 changed files with 472 additions and 210 deletions
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -29,6 +29,8 @@ from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union

 import numpy as np

+import requests
+
 from .file_utils import (
    add_end_docstrings,
    cached_path,
@@ -1515,6 +1517,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            proxies (:obj:`Dict[str, str], `optional`):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                identifier allowed by git.
            inputs (additional positional arguments, `optional`):
                Will be passed along to the Tokenizer ``__init__`` method.
            kwargs (additional keyword arguments, `optional`):
@@ -1549,6 +1555,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)

        s3_models = list(cls.max_model_input_sizes.keys())
        vocab_files = {}
@@ -1601,18 +1608,18 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                            full_file_name = None
                    else:
                        full_file_name = hf_bucket_url(
-                            pretrained_model_name_or_path, filename=file_name, use_cdn=False, mirror=None
+                            pretrained_model_name_or_path, filename=file_name, revision=revision, mirror=None
                        )

                    vocab_files[file_id] = full_file_name

        # Get files from url, cache, or disk depending on the case
-        try:
-            resolved_vocab_files = {}
-            for file_id, file_path in vocab_files.items():
-                if file_path is None:
-                    resolved_vocab_files[file_id] = None
-                else:
+        resolved_vocab_files = {}
+        for file_id, file_path in vocab_files.items():
+            if file_path is None:
+                resolved_vocab_files[file_id] = None
+            else:
+                try:
                    resolved_vocab_files[file_id] = cached_path(
                        file_path,
                        cache_dir=cache_dir,
@@ -1621,34 +1628,20 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                        resume_download=resume_download,
                        local_files_only=local_files_only,
                    )
-        except EnvironmentError:
-            if pretrained_model_name_or_path in s3_models:
-                msg = "Couldn't reach server at '{}' to download vocabulary files."
-            else:
-                msg = (
-                    "Model name '{}' was not found in tokenizers model name list ({}). "
-                    "We assumed '{}' was a path or url to a directory containing vocabulary files "
-                    "named {}, but couldn't find such vocabulary files at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ", ".join(s3_models),
-                        pretrained_model_name_or_path,
-                        list(cls.vocab_files_names.values()),
-                    )
-                )
-
-            raise EnvironmentError(msg)
+                except requests.exceptions.HTTPError as err:
+                    if "404 Client Error" in str(err):
+                        logger.debug(err)
+                        resolved_vocab_files[file_id] = None
+                    else:
+                        raise err

        if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
-            raise EnvironmentError(
-                "Model name '{}' was not found in tokenizers model name list ({}). "
-                "We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files "
-                "named {} but couldn't find such vocabulary files at this path or url.".format(
-                    pretrained_model_name_or_path,
-                    ", ".join(s3_models),
-                    pretrained_model_name_or_path,
-                    list(cls.vocab_files_names.values()),
-                )
+            msg = (
+                f"Can't load tokenizer for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
+                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
+                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing relevant tokenizer files\n\n"
            )
+            raise EnvironmentError(msg)

        for file_id, file_path in vocab_files.items():
            if file_path == resolved_vocab_files[file_id]: