[Json configs] Make json prettier for all saved tokenizer files & ensure same json format for all processors (tok + feat_extract) (#17457)

* [Json dump] Make json prettier

* correct more tokenizers

* more patterns

* add aggressive test

* the aggressive test was actually useful :-)

* more tests

* Apply suggestions from code review
This commit is contained in:
Patrick von Platen
2022-05-31 17:07:30 +02:00
committed by GitHub
parent 6ee1474b67
commit f394a2a50d
22 changed files with 52 additions and 24 deletions

View File

@@ -51,6 +51,7 @@ from transformers import (
from transformers.testing_utils import (
PASS,
USER,
check_json_file_has_correct_format,
get_tests_dir,
is_pt_tf_cross_test,
is_staging_test,
@@ -3325,6 +3326,11 @@ class TokenizerTesterMixin:
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# make sure that all ".json" files are saved in the correct format
for file_path in tokenizer_r_files + tokenizer_p_files:
if os.path.exists(file_path) and file_path.endswith(".json"):
check_json_file_has_correct_format(file_path)
# Checks that it saves the same files, plus the tokenizer.json file for the fast one
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)