[Json configs] Make json prettier for all saved tokenizer files & ensure same json format for all processors (tok + feat_extract) (#17457)
* [Json dump] Make json prettier * correct more tokenizers * more patterns * add aggressive test * the aggressive test was actually useful :-) * more tests * Apply suggestions from code review
This commit is contained in:
committed by
GitHub
parent
6ee1474b67
commit
f394a2a50d
@@ -2118,13 +2118,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
custom_object_save(self, save_directory, config=tokenizer_config)
|
||||
|
||||
with open(tokenizer_config_file, "w", encoding="utf-8") as f:
|
||||
f.write(json.dumps(tokenizer_config, ensure_ascii=False))
|
||||
out_str = json.dumps(tokenizer_config, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
|
||||
f.write(out_str)
|
||||
logger.info(f"tokenizer config file saved in {tokenizer_config_file}")
|
||||
|
||||
# Sanitize AddedTokens in special_tokens_map
|
||||
write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False)
|
||||
with open(special_tokens_map_file, "w", encoding="utf-8") as f:
|
||||
f.write(json.dumps(write_dict, ensure_ascii=False))
|
||||
out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
|
||||
f.write(out_str)
|
||||
logger.info(f"Special tokens file saved in {special_tokens_map_file}")
|
||||
|
||||
file_names = (tokenizer_config_file, special_tokens_map_file)
|
||||
@@ -2168,7 +2170,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
added_vocab = self.get_added_vocab()
|
||||
if added_vocab:
|
||||
with open(added_tokens_file, "w", encoding="utf-8") as f:
|
||||
out_str = json.dumps(added_vocab, ensure_ascii=False)
|
||||
out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
|
||||
f.write(out_str)
|
||||
logger.info(f"added tokens file saved in {added_tokens_file}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user