Fix tokenizer saving and loading error (#6026)

* Fix tokenizer saving and loading bugs that occur when an AddedToken is added to the additional special tokens

* Add tokenizer test

* Style

* Style 2

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
This commit is contained in:
Junyuan Zheng
2020-08-11 04:49:16 -04:00
committed by GitHub
parent 83984a61c6
commit cdf1f7edb2
2 changed files with 16 additions and 0 deletions

View File

@@ -1165,6 +1165,16 @@ class TokenizerTesterMixin:
encoded_sequences_batch_padded_1[key], encoded_sequences_batch_padded_2[key],
)
def test_added_token_serializable(self):
    """Check that a tokenizer holding an ``AddedToken`` special token can be saved and reloaded.

    For every tokenizer returned by ``self.get_tokenizers(do_lower_case=False)``,
    an ``AddedToken`` is registered under ``additional_special_tokens``; the
    tokenizer is then written to a temporary directory with ``save_pretrained``
    and read back with ``from_pretrained``. The test passes as long as neither
    step raises.
    """
    tokenizers = self.get_tokenizers(do_lower_case=False)
    for tokenizer in tokenizers:
        # Label each iteration so a failure names the tokenizer class that broke
        # instead of only pointing at the shared loop body.
        with self.subTest(tokenizer.__class__.__name__):
            new_token = AddedToken("new_token", lstrip=True)
            tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})

            with tempfile.TemporaryDirectory() as tmp_dir_name:
                tokenizer.save_pretrained(tmp_dir_name)
                # The reloaded tokenizer is deliberately discarded: this test only
                # verifies that the save/load round trip does not raise.
                tokenizer.from_pretrained(tmp_dir_name)
def test_batch_encode_plus_padding(self):
# Test that padded sequences are equivalent between batch_encode_plus and encode_plus