[Json configs] Make json prettier for all saved tokenizer files & ensure same json format for all processors (tok + feat_extract) (#17457)
* [Json dump] Make json prettier * correct more tokenizers * more patterns * add aggressive test * the aggressive test was actually useful :-) * more tests * Apply suggestions from code review
This commit is contained in:
committed by
GitHub
parent
6ee1474b67
commit
f394a2a50d
@@ -1494,3 +1494,20 @@ def nested_simplify(obj, decimals=3):
|
||||
return nested_simplify(obj.item(), decimals)
|
||||
else:
|
||||
raise Exception(f"Not supported: {type(obj)}")
|
||||
|
||||
|
||||
def check_json_file_has_correct_format(file_path):
    """Assert that a saved JSON file is pretty-printed with a 2-space indent.

    A file holding exactly one line must be the empty object ``{}``.
    Any other file must span at least three lines: an opening ``{`` line,
    one or more interior lines, and a closing ``}`` line. The first interior
    line must be indented by exactly 2 spaces, and every interior line must
    be indented by a positive multiple of 2 (nested containers sit deeper).

    Args:
        file_path: Path of the JSON file to inspect.

    Raises:
        AssertionError: If the file layout does not match the rules above.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    if len(lines) == 1:
        # length can only be 1 if dict is empty
        assert lines[0] == "{}", f"single-line JSON must be '{{}}', got {lines[0]!r}"
        return

    # otherwise make sure json has correct format (at least 3 lines)
    assert len(lines) >= 3, f"expected at least 3 lines, got {len(lines)}"
    # each key on its own line, indent should be 2, min length is 3
    assert lines[0].strip() == "{", f"first line must be '{{', got {lines[0]!r}"

    # Measure every interior line individually; the previous implementation
    # re-measured lines[1] on each iteration and so skipped all other lines.
    indents = [len(line) - len(line.lstrip()) for line in lines[1:-1]]
    assert indents[0] == 2, f"first key must be indented by 2, got {indents[0]}"
    for offset, left_indent in enumerate(indents, start=2):
        assert left_indent >= 2 and left_indent % 2 == 0, (
            f"line {offset} has indent {left_indent}, expected a positive multiple of 2"
        )

    assert lines[-1].strip() == "}", f"last line must be '}}', got {lines[-1]!r}"
|
||||
|
||||
Reference in New Issue
Block a user