Save Processor (#27761)

* save processor

* Update tests/models/auto/test_processor_auto.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Update tests/test_processing_common.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
Yih-Dar
2024-01-18 11:21:45 +01:00
committed by GitHub
parent 98dda8ed03
commit 3005f96552
6 changed files with 480 additions and 19 deletions

View File

@@ -42,7 +42,7 @@ from transformers import (
)
from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test
from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_tokenizers_available
from transformers.utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, is_tokenizers_available
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
@@ -91,6 +91,28 @@ class AutoFeatureExtractorTest(unittest.TestCase):
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_processor_class(self):
with tempfile.TemporaryDirectory() as tmpdirname:
feature_extractor = Wav2Vec2FeatureExtractor()
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor(feature_extractor, tokenizer)
# save in new folder
processor.save_pretrained(tmpdirname)
# drop `processor_class` in tokenizer config
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "r") as f:
config_dict = json.load(f)
config_dict.pop("processor_class")
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "w") as f:
f.write(json.dumps(config_dict))
processor = AutoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_feat_extr_processor_class(self):
with tempfile.TemporaryDirectory() as tmpdirname:
feature_extractor = Wav2Vec2FeatureExtractor()
@@ -101,6 +123,14 @@ class AutoFeatureExtractorTest(unittest.TestCase):
# save in new folder
processor.save_pretrained(tmpdirname)
# drop `processor_class` in processor
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "r") as f:
config_dict = json.load(f)
config_dict.pop("processor_class")
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f:
f.write(json.dumps(config_dict))
# drop `processor_class` in tokenizer
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "r") as f:
config_dict = json.load(f)
@@ -123,6 +153,14 @@ class AutoFeatureExtractorTest(unittest.TestCase):
# save in new folder
processor.save_pretrained(tmpdirname)
# drop `processor_class` in processor
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "r") as f:
config_dict = json.load(f)
config_dict.pop("processor_class")
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f:
f.write(json.dumps(config_dict))
# drop `processor_class` in feature extractor
with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "r") as f:
config_dict = json.load(f)
@@ -270,6 +308,45 @@ class AutoFeatureExtractorTest(unittest.TestCase):
if CustomConfig in PROCESSOR_MAPPING._extra_content:
del PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_processor_with_extra_attributes(self):
class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
pass
class NewTokenizer(BertTokenizer):
pass
class NewProcessor(ProcessorMixin):
feature_extractor_class = "AutoFeatureExtractor"
tokenizer_class = "AutoTokenizer"
def __init__(self, feature_extractor, tokenizer, processor_attr_1=1, processor_attr_2=True):
super().__init__(feature_extractor, tokenizer)
self.processor_attr_1 = processor_attr_1
self.processor_attr_2 = processor_attr_2
try:
AutoConfig.register("custom", CustomConfig)
AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
AutoProcessor.register(CustomConfig, NewProcessor)
# If remote code is not set, the default is to use local classes.
processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor", processor_attr_2=False
)
self.assertEqual(processor.__class__.__name__, "NewProcessor")
self.assertEqual(processor.processor_attr_1, 1)
self.assertEqual(processor.processor_attr_2, False)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
if CustomConfig in TOKENIZER_MAPPING._extra_content:
del TOKENIZER_MAPPING._extra_content[CustomConfig]
if CustomConfig in PROCESSOR_MAPPING._extra_content:
del PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_auto_processor_creates_tokenizer(self):
processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert")
self.assertEqual(processor.__class__.__name__, "BertTokenizerFast")