Refine errors for pretrained objects (#15261)

* Refine errors for pretrained objects * PoC to avoid using get_list_of_files * Adapt tests to use new errors * Quality + Fix PoC * Revert "PoC to avoid using get_list_of_files" This reverts commit cb93b7cae8504ef837c2a7663cb7955e714f323e. * Revert "Quality + Fix PoC" This reverts commit 3ba6d0d4ca546708b31d355baa9e68ba9736508f. * Fix doc * Revert PoC * Add feature extractors * More tests and PT model * Adapt error message * Feature extractor tests * TF model * Flax model and test * Merge flax auto tests * Add tokenization * Fix test
2022-01-21 15:00:09 -05:00
parent 80af1048cf
commit 6ac77534bf
16 changed files with 603 additions and 103 deletions
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -31,13 +31,16 @@ from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequenc
 import numpy as np
 from packaging import version

-import requests
+from requests import HTTPError

 from . import __version__
 from .file_utils import (
+    EntryNotFoundError,
    ExplicitEnum,
    PaddingStrategy,
    PushToHubMixin,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
    TensorType,
    _is_jax,
    _is_numpy,
@@ -1704,9 +1707,28 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                    else:
                        raise error

-                except requests.exceptions.HTTPError as err:
+                except RepositoryNotFoundError as err:
+                    logger.error(err)
+                    raise EnvironmentError(
+                        f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
+                        "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to "
+                        "pass a token having permission to this repo with `use_auth_token` or log in with "
+                        "`huggingface-cli login` and pass `use_auth_token=True`."
+                    )
+                except RevisionNotFoundError as err:
+                    logger.error(err)
+                    raise EnvironmentError(
+                        f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists "
+                        "for this model name. Check the model page at "
+                        f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
+                    )
+                except EntryNotFoundError:
+                    logger.debug(f"{pretrained_model_name_or_path} does not contain a file named {file_path}.")
+                    resolved_vocab_files[file_id] = None
+
+                except HTTPError as err:
                    if "404 Client Error" in str(err):
-                        logger.debug(err)
+                        logger.debug(f"Connection problem to access {file_path}.")
                        resolved_vocab_files[file_id] = None
                    else:
                        raise err
@@ -1718,18 +1740,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
            )

        if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
-            msg = (
-                f"Can't load tokenizer for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
-                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n"
-                f"  (make sure '{pretrained_model_name_or_path}' is not a path to a local directory with something else, in that case)\n\n"
-                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing relevant tokenizer files\n\n"
+            raise EnvironmentError(
+                f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from "
+                "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
+                f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
+                f"containing all relevant tokenizer files."
            )

-            if revision is not None:
-                msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n"
-
-            raise EnvironmentError(msg)
-
        for file_id, file_path in vocab_files.items():
            if file_id not in resolved_vocab_files:
                continue
@@ -3504,9 +3521,13 @@ def get_fast_tokenizer_file(
        `str`: The tokenizer file to use.
    """
    # Inspect all files from the repo/folder.
-    all_files = get_list_of_files(
-        path_or_repo, revision=revision, use_auth_token=use_auth_token, local_files_only=local_files_only
-    )
+    try:
+        all_files = get_list_of_files(
+            path_or_repo, revision=revision, use_auth_token=use_auth_token, local_files_only=local_files_only
+        )
+    except Exception:
+        return FULL_TOKENIZER_FILE
+
    tokenizer_files_map = {}
    for file_name in all_files:
        search = _re_tokenizer_file.search(file_name)