Fix tokenizer saving and loading error (#6026)

* fix tokenizer saving and loading bugs when adding AddedToken to additional special tokens
* Add tokenizer test
* Style
* Style 2

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-08-11 04:49:16 -04:00
parent 83984a61c6
commit cdf1f7edb2
2 changed files with 16 additions and 0 deletions
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -1165,6 +1165,16 @@ class TokenizerTesterMixin:
                        encoded_sequences_batch_padded_1[key], encoded_sequences_batch_padded_2[key],
                    )

+    def test_added_token_serializable(self):  # regression test: save/load with an AddedToken special token
+        tokenizers = self.get_tokenizers(do_lower_case=False)
+        for tokenizer in tokenizers:
+            new_token = AddedToken("new_token", lstrip=True)  # token is passed as an AddedToken object
+            tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
+
+            with tempfile.TemporaryDirectory() as tmp_dir_name:  # directory is removed when the block exits
+                tokenizer.save_pretrained(tmp_dir_name)
+                tokenizer.from_pretrained(tmp_dir_name)  # no assertions: passes if save and reload do not raise
+
    def test_batch_encode_plus_padding(self):
        # Test that padded sequences are equivalent between batch_encode_plus and encode_plus