Remove special treatment for custom vocab files (#10637)

* Remove special path for custom vocab files
* Update src/transformers/tokenization_utils_base.py
  Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
* Expand error message
  Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2021-03-11 11:11:56 -05:00
parent 6d9e11a193
commit 89693e170d
1 changed files with 55 additions and 73 deletions
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1601,38 +1601,20 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            logger.info("Offline mode: forcing local_files_only=True")
            local_files_only = True
        s3_models = list(cls.max_model_input_sizes.keys())
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        vocab_files = {}
        init_configuration = {}
        if pretrained_model_name_or_path in s3_models:
            # Get the vocabulary from AWS S3 bucket
            for file_id, map_list in cls.pretrained_vocab_files_map.items():
                vocab_files[file_id] = map_list[pretrained_model_name_or_path]
            if (
                cls.pretrained_init_configuration
                and pretrained_model_name_or_path in cls.pretrained_init_configuration
            ):
                init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy()
        else:
            # Get the vocabulary from local files
            logger.info(
                "Model name '{}' not found in model shortcut name list ({}). "
                "Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format(
                    pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path
                )
            )
        if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
            if len(cls.vocab_files_names) > 1:
                raise ValueError(
-                        "Calling {}.from_pretrained() with the path to a single file or url is not supported."
+                    f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not "
-                        "Use a model identifier or the path to a directory instead.".format(cls.__name__)
+                    "supported for this tokenizer. Use a model identifier or the path to a directory instead."
                    )
                logger.warning(
                    "Calling {}.from_pretrained() with the path to a single file or url is deprecated".format(
                        cls.__name__
                )
            warnings.warn(
                f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and "
                "won't be possible anymore in v5. Use a model identifier or the path to a directory instead.",
                FutureWarning,
            )
            file_id = list(cls.vocab_files_names.keys())[0]
            vocab_files[file_id] = pretrained_model_name_or_path
@@ -1652,7 +1634,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                    else:
                        full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                    if not os.path.exists(full_file_name):
-                            logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                        logger.info(f"Didn't find file {full_file_name}. We won't load it.")
                        full_file_name = None
                else:
                    full_file_name = hf_bucket_url(
@@ -1672,7 +1654,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            if file_path is None:
                resolved_vocab_files[file_id] = None
            else:
                try:
                try:
                    resolved_vocab_files[file_id] = cached_path(
                        file_path,
@@ -1683,6 +1664,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                        local_files_only=local_files_only,
                        use_auth_token=use_auth_token,
                    )
                except FileNotFoundError as error:
                    if local_files_only:
                        unresolved_files.append(file_id)
@@ -1715,9 +1697,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                continue
            if file_path == resolved_vocab_files[file_id]:
-                logger.info("loading file {}".format(file_path))
+                logger.info(f"loading file {file_path}")
            else:
-                logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))
+                logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
        return cls._from_pretrained(
            resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs