# Patch summary: round-trip AddedToken objects stored inside list-valued special-token
# entries (the new test exercises "additional_special_tokens").
#
# src/transformers/tokenization_utils_base.py
#   * hunk @@ -1562: while iterating special_tokens_map and calling setattr on the tokenizer,
#     a list value now has each dict element rebuilt via AddedToken(**token); elements that
#     are not dicts are kept unchanged.
#   * hunk @@ -1633: while filling write_dict ahead of json.dumps, a list value now has each
#     AddedToken element replaced by token.__getstate__(); other elements are kept unchanged.
#
# tests/test_tokenization_common.py
#   * adds test_added_token_serializable: registers AddedToken("new_token", lstrip=True) as an
#     additional special token, then calls save_pretrained and from_pretrained on a temporary
#     directory. The test contains no assertions, so it only checks that neither call raises.
#
# NOTE(review): line breaks and leading whitespace below were reconstructed from the Python
# block structure of a whitespace-collapsed patch -- confirm the indentation of the context
# lines against the target files before applying.
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 267d72485f..cf12f2d720 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1562,6 +1562,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
             for key, value in special_tokens_map.items():
                 if isinstance(value, dict):
                     value = AddedToken(**value)
+                elif isinstance(value, list):
+                    value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
                 setattr(tokenizer, key, value)
 
         # Add supplementary tokens.
@@ -1633,6 +1635,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
             for key, value in self.special_tokens_map_extended.items():
                 if isinstance(value, AddedToken):
                     write_dict[key] = value.__getstate__()
+                elif isinstance(value, list):
+                    write_dict[key] = [
+                        token.__getstate__() if isinstance(token, AddedToken) else token for token in value
+                    ]
                 else:
                     write_dict[key] = value
             f.write(json.dumps(write_dict, ensure_ascii=False))
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index ba891f0cbb..4b841f850e 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -1165,6 +1165,16 @@ class TokenizerTesterMixin:
                         encoded_sequences_batch_padded_1[key],
                         encoded_sequences_batch_padded_2[key],
                     )
 
+    def test_added_token_serializable(self):
+        tokenizers = self.get_tokenizers(do_lower_case=False)
+        for tokenizer in tokenizers:
+            new_token = AddedToken("new_token", lstrip=True)
+            tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
+
+            with tempfile.TemporaryDirectory() as tmp_dir_name:
+                tokenizer.save_pretrained(tmp_dir_name)
+                tokenizer.from_pretrained(tmp_dir_name)
+
     def test_batch_encode_plus_padding(self):
         # Test that padded sequences are equivalent between batch_encode_plus and encode_plus