[Json configs] Make json prettier for all saved tokenizer files & ensure same json format for all processors (tok + feat_extract) (#17457)

* [Json dump] Make json prettier

* correct more tokenizeirs

* more patterns

* add aggressive test

* the aggressive test was actually useful :-)

* more tests

* Apply suggestions from code review
This commit is contained in:
Patrick von Platen
2022-05-31 17:07:30 +02:00
committed by GitHub
parent 6ee1474b67
commit f394a2a50d
22 changed files with 52 additions and 24 deletions

View File

@@ -1494,3 +1494,20 @@ def nested_simplify(obj, decimals=3):
return nested_simplify(obj.item(), decimals)
else:
raise Exception(f"Not supported: {type(obj)}")
def check_json_file_has_correct_format(file_path):
with open(file_path, "r") as f:
lines = f.readlines()
if len(lines) == 1:
# length can only be 1 if dict is empty
assert lines[0] == "{}"
else:
# otherwise make sure json has correct format (at least 3 lines)
assert len(lines) >= 3
# each key one line, ident should be 2, min length is 3
assert lines[0].strip() == "{"
for line in lines[1:-1]:
left_indent = len(lines[1]) - len(lines[1].lstrip())
assert left_indent == 2
assert lines[-1].strip() == "}"