From 3005f965524c1bac5512c881541089890b020aa7 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 18 Jan 2024 11:21:45 +0100 Subject: [PATCH] Save `Processor` (#27761) * save processor * Update tests/models/auto/test_processor_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update tests/test_processing_common.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * fix --------- Co-authored-by: ydshieh Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- .../models/auto/processing_auto.py | 41 ++- src/transformers/processing_utils.py | 245 +++++++++++++++++- src/transformers/utils/__init__.py | 1 + tests/models/auto/test_processor_auto.py | 79 +++++- tests/models/clip/test_processor_clip.py | 6 +- tests/test_processing_common.py | 127 +++++++++ 6 files changed, 480 insertions(+), 19 deletions(-) create mode 100644 tests/test_processing_common.py diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index eee8af931e..208cd53ac2 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -25,8 +25,9 @@ from ...configuration_utils import PretrainedConfig from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code from ...feature_extraction_utils import FeatureExtractionMixin from ...image_processing_utils import ImageProcessingMixin +from ...processing_utils import ProcessorMixin from ...tokenization_utils import TOKENIZER_CONFIG_FILE -from ...utils import FEATURE_EXTRACTOR_NAME, get_file_from_repo, logging +from ...utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, get_file_from_repo, logging from .auto_factory import _LazyAutoMapping from .configuration_auto import ( CONFIG_MAPPING_NAMES, @@ -227,27 +228,41 @@ class AutoProcessor: processor_class = None processor_auto_map = None - # First, let's see if 
we have a preprocessor config. + # First, let's see if we have a processor or preprocessor config. # Filter the kwargs for `get_file_from_repo`. get_file_from_repo_kwargs = { key: kwargs[key] for key in inspect.signature(get_file_from_repo).parameters.keys() if key in kwargs } - # Let's start by checking whether the processor class is saved in an image processor - preprocessor_config_file = get_file_from_repo( - pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, **get_file_from_repo_kwargs + + # Let's start by checking whether the processor class is saved in a processor config + processor_config_file = get_file_from_repo( + pretrained_model_name_or_path, PROCESSOR_NAME, **get_file_from_repo_kwargs ) - if preprocessor_config_file is not None: - config_dict, _ = ImageProcessingMixin.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) + if processor_config_file is not None: + config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs) processor_class = config_dict.get("processor_class", None) if "AutoProcessor" in config_dict.get("auto_map", {}): processor_auto_map = config_dict["auto_map"]["AutoProcessor"] - # If not found, let's check whether the processor class is saved in a feature extractor config - if preprocessor_config_file is not None and processor_class is None: - config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) - processor_class = config_dict.get("processor_class", None) - if "AutoProcessor" in config_dict.get("auto_map", {}): - processor_auto_map = config_dict["auto_map"]["AutoProcessor"] + if processor_class is None: + # If not found, let's check whether the processor class is saved in an image processor config + preprocessor_config_file = get_file_from_repo( + pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, **get_file_from_repo_kwargs + ) + if preprocessor_config_file is not None: + config_dict, _ = 
ImageProcessingMixin.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) + processor_class = config_dict.get("processor_class", None) + if "AutoProcessor" in config_dict.get("auto_map", {}): + processor_auto_map = config_dict["auto_map"]["AutoProcessor"] + + # If not found, let's check whether the processor class is saved in a feature extractor config + if preprocessor_config_file is not None and processor_class is None: + config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict( + pretrained_model_name_or_path, **kwargs + ) + processor_class = config_dict.get("processor_class", None) + if "AutoProcessor" in config_dict.get("auto_map", {}): + processor_auto_map = config_dict["auto_map"]["AutoProcessor"] if processor_class is None: # Next, let's check whether the processor class is saved in a tokenizer diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 41236fe9e1..01c824f92c 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -16,14 +16,28 @@ Processing saving/loading class for common processors. """ +import copy +import inspect +import json import os import warnings from pathlib import Path -from typing import Optional, Union +from typing import Any, Dict, Optional, Tuple, Union from .dynamic_module_utils import custom_object_save from .tokenization_utils_base import PreTrainedTokenizerBase -from .utils import PushToHubMixin, copy_func, direct_transformers_import, logging +from .utils import ( + PROCESSOR_NAME, + PushToHubMixin, + add_model_info_to_auto_map, + cached_file, + copy_func, + direct_transformers_import, + download_url, + is_offline_mode, + is_remote_url, + logging, +) logger = logging.get_logger(__name__) @@ -85,10 +99,70 @@ class ProcessorMixin(PushToHubMixin): setattr(self, attribute_name, arg) + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. 
+ + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this processor instance. + """ + output = copy.deepcopy(self.__dict__) + + # Get the kwargs in `__init__`. + sig = inspect.signature(self.__init__) + # Only save the attributes that are present in the kwargs of `__init__`. + attrs_to_save = sig.parameters + # Don't save attributes like `tokenizer`, `image_processor` etc. + attrs_to_save = [x for x in attrs_to_save if x not in self.__class__.attributes] + # extra attributes to be kept + attrs_to_save += ["auto_map"] + + output = {k: v for k, v in output.items() if k in attrs_to_save} + + output["processor_class"] = self.__class__.__name__ + + if "tokenizer" in output: + del output["tokenizer"] + if "image_processor" in output: + del output["image_processor"] + if "feature_extractor" in output: + del output["feature_extractor"] + + # Some attributes have different names but contain objects that are not simple strings + output = { + k: v + for k, v in output.items() + if not (isinstance(v, PushToHubMixin) or v.__class__.__name__ == "BeamSearchDecoderCTC") + } + + return output + + def to_json_string(self) -> str: + """ + Serializes this instance to a JSON string. + + Returns: + `str`: String containing all the attributes that make up this processor instance in JSON format. + """ + dictionary = self.to_dict() + + return json.dumps(dictionary, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this processor instance's parameters will be saved. 
+ """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string()) + def __repr__(self): attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes] attributes_repr = "\n".join(attributes_repr) - return f"{self.__class__.__name__}:\n{attributes_repr}" + return f"{self.__class__.__name__}:\n{attributes_repr}\n\n{self.to_json_string()}" def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): """ @@ -139,6 +213,7 @@ class ProcessorMixin(PushToHubMixin): if self._auto_class is not None: attrs = [getattr(self, attribute_name) for attribute_name in self.attributes] configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs] + configs.append(self) custom_object_save(self, save_directory, config=configs) for attribute_name in self.attributes: @@ -156,6 +231,12 @@ class ProcessorMixin(PushToHubMixin): if isinstance(attribute, PreTrainedTokenizerBase): del attribute.init_kwargs["auto_map"] + # If we save using the predefined names, we can load using `from_pretrained` + output_processor_file = os.path.join(save_directory, PROCESSOR_NAME) + + self.to_json_file(output_processor_file) + logger.info(f"processor saved in {output_processor_file}") + if push_to_hub: self._upload_modified_files( save_directory, @@ -165,6 +246,150 @@ class ProcessorMixin(PushToHubMixin): token=kwargs.get("token"), ) + return [output_processor_file] + + @classmethod + def get_processor_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a + processor of type [`~processing_utils.ProcessingMixin`] using `from_args_and_dict`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. 
+ subfolder (`str`, *optional*, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can + specify the folder name here. + + Returns: + `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the processor object. + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", "") + + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + + user_agent = {"file_type": "processor", "from_auto_class": from_auto_class} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + processor_file = os.path.join(pretrained_model_name_or_path, PROCESSOR_NAME) + if os.path.isfile(pretrained_model_name_or_path): + resolved_processor_file = pretrained_model_name_or_path + is_local = True + elif is_remote_url(pretrained_model_name_or_path): + processor_file = pretrained_model_name_or_path + resolved_processor_file = download_url(pretrained_model_name_or_path) + else: + processor_file = PROCESSOR_NAME + try: + # Load from local folder or from cache or download from model Hub and cache + resolved_processor_file = cached_file( + pretrained_model_name_or_path, + processor_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + 
+ local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder, + ) + except EnvironmentError: + # Re-raise any environment error raised by `cached_file`. It will have a helpful error message adapted to + # the original exception. + raise + except Exception: + # For any other exception, we throw a generic error. + raise EnvironmentError( + f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load" + " it from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" + f" directory containing a {PROCESSOR_NAME} file" + ) + + try: + # Load processor dict + with open(resolved_processor_file, "r", encoding="utf-8") as reader: + text = reader.read() + processor_dict = json.loads(text) + + except json.JSONDecodeError: + raise EnvironmentError( + f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file." + ) + + if is_local: + logger.info(f"loading configuration file {resolved_processor_file}") + else: + logger.info(f"loading configuration file {processor_file} from cache at {resolved_processor_file}") + + if "auto_map" in processor_dict and not is_local: + processor_dict["auto_map"] = add_model_info_to_auto_map( + processor_dict["auto_map"], pretrained_model_name_or_path + ) + + return processor_dict, kwargs + + @classmethod + def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): + """ + Instantiates a type of [`~processing_utils.ProcessorMixin`] from a Python dictionary of parameters. + + Args: + processor_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the processor object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + [`~processing_utils.ProcessorMixin.to_dict`] method. 
+ kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the processor object. + + Returns: + [`~processing_utils.ProcessorMixin`]: The processor object instantiated from those + parameters. + """ + processor_dict = processor_dict.copy() + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + # Unlike image processors or feature extractors, whose `__init__` accepts `kwargs`, processors don't have `kwargs`. + # We have to pop some unused (but specific) arguments to make it work. + if "processor_class" in processor_dict: + del processor_dict["processor_class"] + + if "auto_map" in processor_dict: + del processor_dict["auto_map"] + + processor = cls(*args, **processor_dict) + + # Update processor with kwargs if needed + for key in set(kwargs.keys()): + if hasattr(processor, key): + setattr(processor, key, kwargs.pop(key)) + + logger.info(f"Processor {processor}") + if return_unused_kwargs: + return processor, kwargs + else: + return processor + @classmethod def from_pretrained( cls, @@ -226,7 +451,19 @@ class ProcessorMixin(PushToHubMixin): kwargs["token"] = token args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) - return cls(*args) + + # Existing processors on the Hub created before #27761 was merged don't have `processor_config.json` (if not + # updated afterward), and we need to keep `from_pretrained` working. So here it falls back to an empty dict. + # However, for models added in the future, we won't get the expected error if this file is missing. + try: + processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs) + except EnvironmentError as e: + if "does not appear to have a file named processor_config.json." 
in str(e): + processor_dict, kwargs = {}, kwargs + else: + raise + + return cls.from_args_and_dict(args, processor_dict, **kwargs) @classmethod def register_for_auto_class(cls, auto_class="AutoProcessor"): diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 780090aec5..bb05dd28ef 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -217,6 +217,7 @@ SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json" CONFIG_NAME = "config.json" FEATURE_EXTRACTOR_NAME = "preprocessor_config.json" IMAGE_PROCESSOR_NAME = FEATURE_EXTRACTOR_NAME +PROCESSOR_NAME = "processor_config.json" GENERATION_CONFIG_NAME = "generation_config.json" MODEL_CARD_NAME = "modelcard.json" diff --git a/tests/models/auto/test_processor_auto.py b/tests/models/auto/test_processor_auto.py index bf4a92475d..c22013234f 100644 --- a/tests/models/auto/test_processor_auto.py +++ b/tests/models/auto/test_processor_auto.py @@ -42,7 +42,7 @@ from transformers import ( ) from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE -from transformers.utils import FEATURE_EXTRACTOR_NAME, is_tokenizers_available +from transformers.utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, is_tokenizers_available sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils")) @@ -91,6 +91,28 @@ class AutoFeatureExtractorTest(unittest.TestCase): self.assertIsInstance(processor, Wav2Vec2Processor) + def test_processor_from_processor_class(self): + with tempfile.TemporaryDirectory() as tmpdirname: + feature_extractor = Wav2Vec2FeatureExtractor() + tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h") + + processor = Wav2Vec2Processor(feature_extractor, tokenizer) + + # save in new folder + processor.save_pretrained(tmpdirname) + + # drop `processor_class` in tokenizer config + with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), 
"r") as f: + config_dict = json.load(f) + config_dict.pop("processor_class") + + with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "w") as f: + f.write(json.dumps(config_dict)) + + processor = AutoProcessor.from_pretrained(tmpdirname) + + self.assertIsInstance(processor, Wav2Vec2Processor) + def test_processor_from_feat_extr_processor_class(self): with tempfile.TemporaryDirectory() as tmpdirname: feature_extractor = Wav2Vec2FeatureExtractor() @@ -101,6 +123,14 @@ class AutoFeatureExtractorTest(unittest.TestCase): # save in new folder processor.save_pretrained(tmpdirname) + # drop `processor_class` in processor + with open(os.path.join(tmpdirname, PROCESSOR_NAME), "r") as f: + config_dict = json.load(f) + config_dict.pop("processor_class") + + with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f: + f.write(json.dumps(config_dict)) + # drop `processor_class` in tokenizer with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "r") as f: config_dict = json.load(f) @@ -123,6 +153,14 @@ class AutoFeatureExtractorTest(unittest.TestCase): # save in new folder processor.save_pretrained(tmpdirname) + # drop `processor_class` in processor + with open(os.path.join(tmpdirname, PROCESSOR_NAME), "r") as f: + config_dict = json.load(f) + config_dict.pop("processor_class") + + with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f: + f.write(json.dumps(config_dict)) + # drop `processor_class` in feature extractor with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "r") as f: config_dict = json.load(f) @@ -270,6 +308,45 @@ class AutoFeatureExtractorTest(unittest.TestCase): if CustomConfig in PROCESSOR_MAPPING._extra_content: del PROCESSOR_MAPPING._extra_content[CustomConfig] + def test_from_pretrained_dynamic_processor_with_extra_attributes(self): + class NewFeatureExtractor(Wav2Vec2FeatureExtractor): + pass + + class NewTokenizer(BertTokenizer): + pass + + class NewProcessor(ProcessorMixin): + feature_extractor_class = "AutoFeatureExtractor" + 
tokenizer_class = "AutoTokenizer" + + def __init__(self, feature_extractor, tokenizer, processor_attr_1=1, processor_attr_2=True): + super().__init__(feature_extractor, tokenizer) + + self.processor_attr_1 = processor_attr_1 + self.processor_attr_2 = processor_attr_2 + + try: + AutoConfig.register("custom", CustomConfig) + AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor) + AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer) + AutoProcessor.register(CustomConfig, NewProcessor) + # If remote code is not set, the default is to use local classes. + processor = AutoProcessor.from_pretrained( + "hf-internal-testing/test_dynamic_processor", processor_attr_2=False + ) + self.assertEqual(processor.__class__.__name__, "NewProcessor") + self.assertEqual(processor.processor_attr_1, 1) + self.assertEqual(processor.processor_attr_2, False) + finally: + if "custom" in CONFIG_MAPPING._extra_content: + del CONFIG_MAPPING._extra_content["custom"] + if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content: + del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig] + if CustomConfig in TOKENIZER_MAPPING._extra_content: + del TOKENIZER_MAPPING._extra_content[CustomConfig] + if CustomConfig in PROCESSOR_MAPPING._extra_content: + del PROCESSOR_MAPPING._extra_content[CustomConfig] + def test_auto_processor_creates_tokenizer(self): processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert") self.assertEqual(processor.__class__.__name__, "BertTokenizerFast") diff --git a/tests/models/clip/test_processor_clip.py b/tests/models/clip/test_processor_clip.py index fb88ef2705..a76d3b33b8 100644 --- a/tests/models/clip/test_processor_clip.py +++ b/tests/models/clip/test_processor_clip.py @@ -26,6 +26,8 @@ from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from ...test_processing_common 
import ProcessorTesterMixin + if is_vision_available(): from PIL import Image @@ -34,7 +36,9 @@ if is_vision_available(): @require_vision -class CLIPProcessorTest(unittest.TestCase): +class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = CLIPProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py new file mode 100644 index 0000000000..1ab215e34c --- /dev/null +++ b/tests/test_processing_common.py @@ -0,0 +1,127 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import tempfile +import unittest + +from transformers import CLIPTokenizerFast, ProcessorMixin +from transformers.models.auto.processing_auto import processor_class_from_name +from transformers.testing_utils import ( + check_json_file_has_correct_format, + require_tokenizers, + require_torch, + require_vision, +) +from transformers.utils import is_vision_available + + +if is_vision_available(): + from transformers import CLIPImageProcessor + + +@require_torch +class ProcessorTesterMixin: + processor_class = None + + def prepare_processor_dict(self): + return {} + + def get_component(self, attribute, **kwargs): + assert attribute in self.processor_class.attributes + component_class_name = getattr(self.processor_class, f"{attribute}_class") + if isinstance(component_class_name, tuple): + component_class_name = component_class_name[0] + + component_class = processor_class_from_name(component_class_name) + component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + + return component + + def prepare_components(self): + components = {} + for attribute in self.processor_class.attributes: + component = self.get_component(attribute) + components[attribute] = component + + return components + + def get_processor(self): + components = self.prepare_components() + processor = self.processor_class(**components, **self.prepare_processor_dict()) + return processor + + def test_processor_to_json_string(self): + processor = self.get_processor() + obj = json.loads(processor.to_json_string()) + for key, value in self.prepare_processor_dict().items(): + self.assertEqual(obj[key], value) + self.assertEqual(getattr(processor, key, None), value) + + def test_processor_from_and_save_pretrained(self): + processor_first = self.get_processor() + + with tempfile.TemporaryDirectory() as tmpdirname: + saved_file = processor_first.save_pretrained(tmpdirname)[0] + check_json_file_has_correct_format(saved_file) + processor_second = 
self.processor_class.from_pretrained(tmpdirname) + + self.assertEqual(processor_second.to_dict(), processor_first.to_dict()) + + +class MyProcessor(ProcessorMixin): + attributes = ["image_processor", "tokenizer"] + image_processor_class = "CLIPImageProcessor" + tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") + + def __init__(self, image_processor=None, tokenizer=None, processor_attr_1=1, processor_attr_2=True): + super().__init__(image_processor, tokenizer) + + self.processor_attr_1 = processor_attr_1 + self.processor_attr_2 = processor_attr_2 + + +@require_tokenizers +@require_vision +class ProcessorTest(unittest.TestCase): + processor_class = MyProcessor + + def prepare_processor_dict(self): + return {"processor_attr_1": 1, "processor_attr_2": False} + + def get_processor(self): + image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") + tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14") + processor = MyProcessor(image_processor, tokenizer, **self.prepare_processor_dict()) + + return processor + + def test_processor_to_json_string(self): + processor = self.get_processor() + obj = json.loads(processor.to_json_string()) + for key, value in self.prepare_processor_dict().items(): + self.assertEqual(obj[key], value) + self.assertEqual(getattr(processor, key, None), value) + + def test_processor_from_and_save_pretrained(self): + processor_first = self.get_processor() + + with tempfile.TemporaryDirectory() as tmpdirname: + saved_file = processor_first.save_pretrained(tmpdirname)[0] + check_json_file_has_correct_format(saved_file) + processor_second = self.processor_class.from_pretrained(tmpdirname) + + self.assertEqual(processor_second.to_dict(), processor_first.to_dict())