Fix tokenizer saving and loading error (#6026)

* fix tokenizer saving and loading bugs when adding AddedToken to additional special tokens

* Add tokenizer test

* Style

* Style 2

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
This commit is contained in:
Junyuan Zheng
2020-08-11 04:49:16 -04:00
committed by GitHub
parent 83984a61c6
commit cdf1f7edb2
2 changed files with 16 additions and 0 deletions

View File

@@ -1562,6 +1562,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
for key, value in special_tokens_map.items():
if isinstance(value, dict):
value = AddedToken(**value)
elif isinstance(value, list):
value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
setattr(tokenizer, key, value)
# Add supplementary tokens.
@@ -1633,6 +1635,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
for key, value in self.special_tokens_map_extended.items():
if isinstance(value, AddedToken):
write_dict[key] = value.__getstate__()
elif isinstance(value, list):
write_dict[key] = [
token.__getstate__() if isinstance(token, AddedToken) else token for token in value
]
else:
write_dict[key] = value
f.write(json.dumps(write_dict, ensure_ascii=False))