Save Processor (#27761)
* save processor * Update tests/models/auto/test_processor_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update tests/test_processing_common.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
@@ -42,7 +42,7 @@ from transformers import (
|
||||
)
|
||||
from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test
|
||||
from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
|
||||
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_tokenizers_available
|
||||
from transformers.utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, is_tokenizers_available
|
||||
|
||||
|
||||
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
|
||||
@@ -91,6 +91,28 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
||||
|
||||
self.assertIsInstance(processor, Wav2Vec2Processor)
|
||||
|
||||
def test_processor_from_processor_class(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
feature_extractor = Wav2Vec2FeatureExtractor()
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
|
||||
|
||||
processor = Wav2Vec2Processor(feature_extractor, tokenizer)
|
||||
|
||||
# save in new folder
|
||||
processor.save_pretrained(tmpdirname)
|
||||
|
||||
# drop `processor_class` in tokenizer config
|
||||
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "r") as f:
|
||||
config_dict = json.load(f)
|
||||
config_dict.pop("processor_class")
|
||||
|
||||
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "w") as f:
|
||||
f.write(json.dumps(config_dict))
|
||||
|
||||
processor = AutoProcessor.from_pretrained(tmpdirname)
|
||||
|
||||
self.assertIsInstance(processor, Wav2Vec2Processor)
|
||||
|
||||
def test_processor_from_feat_extr_processor_class(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
feature_extractor = Wav2Vec2FeatureExtractor()
|
||||
@@ -101,6 +123,14 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
||||
# save in new folder
|
||||
processor.save_pretrained(tmpdirname)
|
||||
|
||||
# drop `processor_class` in processor
|
||||
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "r") as f:
|
||||
config_dict = json.load(f)
|
||||
config_dict.pop("processor_class")
|
||||
|
||||
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f:
|
||||
f.write(json.dumps(config_dict))
|
||||
|
||||
# drop `processor_class` in tokenizer
|
||||
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "r") as f:
|
||||
config_dict = json.load(f)
|
||||
@@ -123,6 +153,14 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
||||
# save in new folder
|
||||
processor.save_pretrained(tmpdirname)
|
||||
|
||||
# drop `processor_class` in processor
|
||||
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "r") as f:
|
||||
config_dict = json.load(f)
|
||||
config_dict.pop("processor_class")
|
||||
|
||||
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f:
|
||||
f.write(json.dumps(config_dict))
|
||||
|
||||
# drop `processor_class` in feature extractor
|
||||
with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "r") as f:
|
||||
config_dict = json.load(f)
|
||||
@@ -270,6 +308,45 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
||||
if CustomConfig in PROCESSOR_MAPPING._extra_content:
|
||||
del PROCESSOR_MAPPING._extra_content[CustomConfig]
|
||||
|
||||
def test_from_pretrained_dynamic_processor_with_extra_attributes(self):
|
||||
class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
|
||||
pass
|
||||
|
||||
class NewTokenizer(BertTokenizer):
|
||||
pass
|
||||
|
||||
class NewProcessor(ProcessorMixin):
|
||||
feature_extractor_class = "AutoFeatureExtractor"
|
||||
tokenizer_class = "AutoTokenizer"
|
||||
|
||||
def __init__(self, feature_extractor, tokenizer, processor_attr_1=1, processor_attr_2=True):
|
||||
super().__init__(feature_extractor, tokenizer)
|
||||
|
||||
self.processor_attr_1 = processor_attr_1
|
||||
self.processor_attr_2 = processor_attr_2
|
||||
|
||||
try:
|
||||
AutoConfig.register("custom", CustomConfig)
|
||||
AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
|
||||
AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
|
||||
AutoProcessor.register(CustomConfig, NewProcessor)
|
||||
# If remote code is not set, the default is to use local classes.
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
"hf-internal-testing/test_dynamic_processor", processor_attr_2=False
|
||||
)
|
||||
self.assertEqual(processor.__class__.__name__, "NewProcessor")
|
||||
self.assertEqual(processor.processor_attr_1, 1)
|
||||
self.assertEqual(processor.processor_attr_2, False)
|
||||
finally:
|
||||
if "custom" in CONFIG_MAPPING._extra_content:
|
||||
del CONFIG_MAPPING._extra_content["custom"]
|
||||
if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
|
||||
del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
|
||||
if CustomConfig in TOKENIZER_MAPPING._extra_content:
|
||||
del TOKENIZER_MAPPING._extra_content[CustomConfig]
|
||||
if CustomConfig in PROCESSOR_MAPPING._extra_content:
|
||||
del PROCESSOR_MAPPING._extra_content[CustomConfig]
|
||||
|
||||
def test_auto_processor_creates_tokenizer(self):
|
||||
processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert")
|
||||
self.assertEqual(processor.__class__.__name__, "BertTokenizerFast")
|
||||
|
||||
@@ -26,6 +26,8 @@ from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
|
||||
from transformers.testing_utils import require_vision
|
||||
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
@@ -34,7 +36,9 @@ if is_vision_available():
|
||||
|
||||
|
||||
@require_vision
|
||||
class CLIPProcessorTest(unittest.TestCase):
|
||||
class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_class = CLIPProcessor
|
||||
|
||||
def setUp(self):
|
||||
self.tmpdirname = tempfile.mkdtemp()
|
||||
|
||||
|
||||
127
tests/test_processing_common.py
Normal file
127
tests/test_processing_common.py
Normal file
@@ -0,0 +1,127 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import CLIPTokenizerFast, ProcessorMixin
|
||||
from transformers.models.auto.processing_auto import processor_class_from_name
|
||||
from transformers.testing_utils import (
|
||||
check_json_file_has_correct_format,
|
||||
require_tokenizers,
|
||||
require_torch,
|
||||
require_vision,
|
||||
)
|
||||
from transformers.utils import is_vision_available
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import CLIPImageProcessor
|
||||
|
||||
|
||||
@require_torch
|
||||
class ProcessorTesterMixin:
|
||||
processor_class = None
|
||||
|
||||
def prepare_processor_dict(self):
|
||||
return {}
|
||||
|
||||
def get_component(self, attribute, **kwargs):
|
||||
assert attribute in self.processor_class.attributes
|
||||
component_class_name = getattr(self.processor_class, f"{attribute}_class")
|
||||
if isinstance(component_class_name, tuple):
|
||||
component_class_name = component_class_name[0]
|
||||
|
||||
component_class = processor_class_from_name(component_class_name)
|
||||
component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa
|
||||
|
||||
return component
|
||||
|
||||
def prepare_components(self):
|
||||
components = {}
|
||||
for attribute in self.processor_class.attributes:
|
||||
component = self.get_component(attribute)
|
||||
components[attribute] = component
|
||||
|
||||
return components
|
||||
|
||||
def get_processor(self):
|
||||
components = self.prepare_components()
|
||||
processor = self.processor_class(**components, **self.prepare_processor_dict())
|
||||
return processor
|
||||
|
||||
def test_processor_to_json_string(self):
|
||||
processor = self.get_processor()
|
||||
obj = json.loads(processor.to_json_string())
|
||||
for key, value in self.prepare_processor_dict().items():
|
||||
self.assertEqual(obj[key], value)
|
||||
self.assertEqual(getattr(processor, key, None), value)
|
||||
|
||||
def test_processor_from_and_save_pretrained(self):
|
||||
processor_first = self.get_processor()
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
saved_file = processor_first.save_pretrained(tmpdirname)[0]
|
||||
check_json_file_has_correct_format(saved_file)
|
||||
processor_second = self.processor_class.from_pretrained(tmpdirname)
|
||||
|
||||
self.assertEqual(processor_second.to_dict(), processor_first.to_dict())
|
||||
|
||||
|
||||
class MyProcessor(ProcessorMixin):
|
||||
attributes = ["image_processor", "tokenizer"]
|
||||
image_processor_class = "CLIPImageProcessor"
|
||||
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
|
||||
|
||||
def __init__(self, image_processor=None, tokenizer=None, processor_attr_1=1, processor_attr_2=True):
|
||||
super().__init__(image_processor, tokenizer)
|
||||
|
||||
self.processor_attr_1 = processor_attr_1
|
||||
self.processor_attr_2 = processor_attr_2
|
||||
|
||||
|
||||
@require_tokenizers
|
||||
@require_vision
|
||||
class ProcessorTest(unittest.TestCase):
|
||||
processor_class = MyProcessor
|
||||
|
||||
def prepare_processor_dict(self):
|
||||
return {"processor_attr_1": 1, "processor_attr_2": False}
|
||||
|
||||
def get_processor(self):
|
||||
image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
|
||||
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
|
||||
processor = MyProcessor(image_processor, tokenizer, **self.prepare_processor_dict())
|
||||
|
||||
return processor
|
||||
|
||||
def test_processor_to_json_string(self):
|
||||
processor = self.get_processor()
|
||||
obj = json.loads(processor.to_json_string())
|
||||
for key, value in self.prepare_processor_dict().items():
|
||||
self.assertEqual(obj[key], value)
|
||||
self.assertEqual(getattr(processor, key, None), value)
|
||||
|
||||
def test_processor_from_and_save_pretrained(self):
|
||||
processor_first = self.get_processor()
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
saved_file = processor_first.save_pretrained(tmpdirname)[0]
|
||||
check_json_file_has_correct_format(saved_file)
|
||||
processor_second = self.processor_class.from_pretrained(tmpdirname)
|
||||
|
||||
self.assertEqual(processor_second.to_dict(), processor_first.to_dict())
|
||||
Reference in New Issue
Block a user