[Json configs] Make json prettier for all saved tokenizer files & ensure same json format for all processors (tok + feat_extract) (#17457)
* [Json dump] Make json prettier * correct more tokenizers * more patterns * add aggressive test * the aggressive test was actually useful :-) * more tests * Apply suggestions from code review
This commit is contained in:
committed by
GitHub
parent
6ee1474b67
commit
f394a2a50d
@@ -25,7 +25,7 @@ from pathlib import Path
|
||||
from huggingface_hub import Repository, delete_repo, login
|
||||
from requests.exceptions import HTTPError
|
||||
from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor
|
||||
from transformers.testing_utils import PASS, USER, get_tests_dir, is_staging_test
|
||||
from transformers.testing_utils import PASS, USER, check_json_file_has_correct_format, get_tests_dir, is_staging_test
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
|
||||
@@ -107,7 +107,8 @@ class FeatureExtractionSavingTestMixin:
|
||||
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
feat_extract_first.save_pretrained(tmpdirname)
|
||||
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
|
||||
check_json_file_has_correct_format(saved_file)
|
||||
feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
|
||||
|
||||
self.assertEqual(feat_extract_second.to_dict(), feat_extract_first.to_dict())
|
||||
|
||||
@@ -51,6 +51,7 @@ from transformers import (
|
||||
from transformers.testing_utils import (
|
||||
PASS,
|
||||
USER,
|
||||
check_json_file_has_correct_format,
|
||||
get_tests_dir,
|
||||
is_pt_tf_cross_test,
|
||||
is_staging_test,
|
||||
@@ -3325,6 +3326,11 @@ class TokenizerTesterMixin:
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# make sure that all ".json" files are saved in the correct format
|
||||
for file_path in tokenizer_r_files + tokenizer_p_files:
|
||||
if os.path.exists(file_path) and file_path.endswith(".json"):
|
||||
check_json_file_has_correct_format(file_path)
|
||||
|
||||
# Checks that it saves with the same files + the tokenizer.json file for the fast one
|
||||
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
|
||||
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
|
||||
|
||||
Reference in New Issue
Block a user