From 8fae93ca1972c39d19c8cf3d3c6a3dd2530cc59a Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 13 Feb 2021 06:10:22 -0800 Subject: [PATCH] [t5 tokenizer] add info logs (#9897) * save fast tokenizer + add info logs * fix tests * remove the saving of fast tokenizer --- src/transformers/configuration_utils.py | 2 +- src/transformers/models/t5/tokenization_t5.py | 1 + src/transformers/models/t5/tokenization_t5_fast.py | 1 + src/transformers/tokenization_utils_base.py | 3 +++ 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 0fba6fc32a..249c8f9ddf 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -308,7 +308,7 @@ class PretrainedConfig(object): output_config_file = os.path.join(save_directory, CONFIG_NAME) self.to_json_file(output_config_file, use_diff=True) - logger.info("Configuration saved in {}".format(output_config_file)) + logger.info(f"Configuration saved in {output_config_file}") @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 4dcd51d494..0619bdfad1 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -295,5 +295,6 @@ class T5Tokenizer(PreTrainedTokenizer): if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) + logger.info(f"Copy vocab file to {out_vocab_file}") return (out_vocab_file,) diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py index e5f93f20da..a8a9fcb2f1 100644 --- a/src/transformers/models/t5/tokenization_t5_fast.py +++ b/src/transformers/models/t5/tokenization_t5_fast.py @@ -160,6 +160,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast): if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) + logger.info(f"Copy vocab file to {out_vocab_file}") return (out_vocab_file,) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 85cee8f2c3..1836a2dd3d 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1977,11 +1977,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True) with open(tokenizer_config_file, "w", encoding="utf-8") as f: f.write(json.dumps(tokenizer_config, ensure_ascii=False)) + logger.info(f"tokenizer config file saved in {tokenizer_config_file}") # Sanitize AddedTokens in special_tokens_map write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False) with open(special_tokens_map_file, "w", encoding="utf-8") as f: f.write(json.dumps(write_dict, ensure_ascii=False)) + logger.info(f"Special tokens file saved in {special_tokens_map_file}") file_names = (tokenizer_config_file, special_tokens_map_file) @@ -2020,6 +2022,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): with open(added_tokens_file, "w", encoding="utf-8") as f: out_str = json.dumps(added_vocab, ensure_ascii=False) f.write(out_str) + logger.info(f"added tokens file saved in {added_tokens_file}") vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)