[Json configs] Make json prettier for all saved tokenizer files & ensure same json format for all processors (tok + feat_extract) (#17457)

* [Json dump] Make json prettier * correct more tokenizeirs * more patterns * add aggressive test * the aggressive test was actually useful :-) * more tests * Apply suggestions from code review
2022-05-31 17:07:30 +02:00
parent 6ee1474b67
commit f394a2a50d
22 changed files with 52 additions and 24 deletions
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -1494,3 +1494,20 @@ def nested_simplify(obj, decimals=3):
        return nested_simplify(obj.item(), decimals)
    else:
        raise Exception(f"Not supported: {type(obj)}")
+
+
+def check_json_file_has_correct_format(file_path):
+    with open(file_path, "r") as f:
+        lines = f.readlines()
+        if len(lines) == 1:
+            # length can only be 1 if dict is empty
+            assert lines[0] == "{}"
+        else:
+            # otherwise make sure json has correct format (at least 3 lines)
+            assert len(lines) >= 3
+            # each key one line, ident should be 2, min length is 3
+            assert lines[0].strip() == "{"
+            for line in lines[1:-1]:
+                left_indent = len(lines[1]) - len(lines[1].lstrip())
+                assert left_indent == 2
+            assert lines[-1].strip() == "}"