From 9a5b84a0076a04fe9596da72e8668069d4f09ea0 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 23 Nov 2022 18:16:26 +0100 Subject: [PATCH] Use updated `model_max_length` when saving tokenizers (#20401) * Use updated values Co-authored-by: ydshieh --- src/transformers/tokenization_utils_base.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 19cfb5b88d..b58ffc69c2 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2082,6 +2082,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): ) tokenizer_config = copy.deepcopy(self.init_kwargs) + + # TODO: Ensure the modified attributes (those that are also in the __init__ kwargs) will give identical tokenizers + # target_keys = self.init_kwargs.keys() + target_keys = ["model_max_length"] + for k in target_keys: + if hasattr(self, k): + tokenizer_config[k] = getattr(self, k) + if len(self.init_inputs) > 0: tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) for file_id in self.vocab_files_names.keys():