From b9af152efb748b1bff8f6fe0130e62ebb8e11a53 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Tue, 7 Feb 2023 10:51:45 -0800
Subject: [PATCH] [tokenizer] sanitize saved config (#21483)

* [tokenizer] sanitize saved config

* rm config["name_or_path"] test
---
 src/transformers/tokenization_utils_base.py | 4 ++++
 tests/models/auto/test_tokenization_auto.py | 2 --
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 77a8db7952..3d26a231ee 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2153,6 +2153,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         if self._auto_class is not None:
             custom_object_save(self, save_directory, config=tokenizer_config)
 
+        # remove private information
+        if "name_or_path" in tokenizer_config:
+            tokenizer_config.pop("name_or_path")
+
         with open(tokenizer_config_file, "w", encoding="utf-8") as f:
             out_str = json.dumps(tokenizer_config, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
             f.write(out_str)
diff --git a/tests/models/auto/test_tokenization_auto.py b/tests/models/auto/test_tokenization_auto.py
index 5814a76c37..a919ac3eda 100644
--- a/tests/models/auto/test_tokenization_auto.py
+++ b/tests/models/auto/test_tokenization_auto.py
@@ -230,8 +230,6 @@ class AutoTokenizerTest(unittest.TestCase):
 
         # Check the class of the tokenizer was properly saved (note that it always saves the slow class).
         self.assertEqual(config["tokenizer_class"], "BertTokenizer")
-        # Check other keys just to make sure the config was properly saved /reloaded.
-        self.assertEqual(config["name_or_path"], SMALL_MODEL_IDENTIFIER)
 
     def test_new_tokenizer_registration(self):
         try: