[Json configs] Make json prettier for all saved tokenizer files & ensure same json format for all processors (tok + feat_extract) (#17457)

* [Json dump] Make json prettier * correct more tokenizers * more patterns * add aggressive test * the aggressive test was actually useful :-) * more tests * Apply suggestions from code review
2022-05-31 17:07:30 +02:00
parent 6ee1474b67
commit f394a2a50d
22 changed files with 52 additions and 24 deletions
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -51,6 +51,7 @@ from transformers import (
 from transformers.testing_utils import (
    PASS,
    USER,
+    check_json_file_has_correct_format,
    get_tests_dir,
    is_pt_tf_cross_test,
    is_staging_test,
@@ -3325,6 +3326,11 @@ class TokenizerTesterMixin:
                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

+                # make sure that all ".json" files are saved in the correct format
+                for file_path in tokenizer_r_files + tokenizer_p_files:
+                    if os.path.exists(file_path) and file_path.endswith(".json"):
+                        check_json_file_has_correct_format(file_path)
+
                # Checks it save with the same files + the tokenizer.json file for the fast one
                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)