From 89693e170da06126147c6de44e74685bdfa74506 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 11 Mar 2021 11:11:56 -0500 Subject: [PATCH] Remove special treatment for custom vocab files (#10637) * Remove special path for custom vocab files * Update src/transformers/tokenization_utils_base.py Co-authored-by: Patrick von Platen * Expand error message Co-authored-by: Patrick von Platen --- src/transformers/tokenization_utils_base.py | 128 +++++++++----------- 1 file changed, 55 insertions(+), 73 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 20678875d7..92614e154e 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1601,69 +1601,51 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): logger.info("Offline mode: forcing local_files_only=True") local_files_only = True - s3_models = list(cls.max_model_input_sizes.keys()) pretrained_model_name_or_path = str(pretrained_model_name_or_path) vocab_files = {} init_configuration = {} - if pretrained_model_name_or_path in s3_models: - # Get the vocabulary from AWS S3 bucket - for file_id, map_list in cls.pretrained_vocab_files_map.items(): - vocab_files[file_id] = map_list[pretrained_model_name_or_path] - if ( - cls.pretrained_init_configuration - and pretrained_model_name_or_path in cls.pretrained_init_configuration - ): - init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy() - else: - # Get the vocabulary from local files - logger.info( - "Model name '{}' not found in model shortcut name list ({}). " - "Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format( - pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path + + if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + if len(cls.vocab_files_names) > 1: + raise ValueError( + f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not " + "supported for this tokenizer. Use a model identifier or the path to a directory instead." ) + warnings.warn( + f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and " + "won't be possible anymore in v5. Use a model identifier or the path to a directory instead.", + FutureWarning, ) - - if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): - if len(cls.vocab_files_names) > 1: - raise ValueError( - "Calling {}.from_pretrained() with the path to a single file or url is not supported." - "Use a model identifier or the path to a directory instead.".format(cls.__name__) - ) - logger.warning( - "Calling {}.from_pretrained() with the path to a single file or url is deprecated".format( - cls.__name__ - ) - ) - file_id = list(cls.vocab_files_names.keys())[0] - vocab_files[file_id] = pretrained_model_name_or_path - else: - # At this point pretrained_model_name_or_path is either a directory or a model identifier name - additional_files_names = { - "added_tokens_file": ADDED_TOKENS_FILE, - "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, - "tokenizer_config_file": TOKENIZER_CONFIG_FILE, - "tokenizer_file": FULL_TOKENIZER_FILE, - } - # Look for the tokenizer files - for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): - if os.path.isdir(pretrained_model_name_or_path): - if subfolder is not None: - full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name) - else: - full_file_name = os.path.join(pretrained_model_name_or_path, file_name) - if not os.path.exists(full_file_name): - logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) - full_file_name = None + file_id = list(cls.vocab_files_names.keys())[0] + vocab_files[file_id] = pretrained_model_name_or_path + else: + # At this point pretrained_model_name_or_path is either a directory or a model identifier name + additional_files_names = { + "added_tokens_file": ADDED_TOKENS_FILE, + "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, + "tokenizer_config_file": TOKENIZER_CONFIG_FILE, + "tokenizer_file": FULL_TOKENIZER_FILE, + } + # Look for the tokenizer files + for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): + if os.path.isdir(pretrained_model_name_or_path): + if subfolder is not None: + full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name) else: - full_file_name = hf_bucket_url( - pretrained_model_name_or_path, - filename=file_name, - subfolder=subfolder, - revision=revision, - mirror=None, - ) + full_file_name = os.path.join(pretrained_model_name_or_path, file_name) + if not os.path.exists(full_file_name): + logger.info(f"Didn't find file {full_file_name}. We won't load it.") + full_file_name = None + else: + full_file_name = hf_bucket_url( + pretrained_model_name_or_path, + filename=file_name, + subfolder=subfolder, + revision=revision, + mirror=None, + ) - vocab_files[file_id] = full_file_name + vocab_files[file_id] = full_file_name # Get files from url, cache, or disk depending on the case resolved_vocab_files = {} @@ -1673,21 +1655,21 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): resolved_vocab_files[file_id] = None else: try: - try: - resolved_vocab_files[file_id] = cached_path( - file_path, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - ) - except FileNotFoundError as error: - if local_files_only: - unresolved_files.append(file_id) - else: - raise error + resolved_vocab_files[file_id] = cached_path( + file_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + ) + + except FileNotFoundError as error: + if local_files_only: + unresolved_files.append(file_id) + else: + raise error except requests.exceptions.HTTPError as err: if "404 Client Error" in str(err): @@ -1715,9 +1697,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): continue if file_path == resolved_vocab_files[file_id]: - logger.info("loading file {}".format(file_path)) + logger.info(f"loading file {file_path}") else: - logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id])) + logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}") return cls._from_pretrained( resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs