Fast image processor (#28847)

* Draft fast image processors * Draft working fast version * py3.8 compatible cache * Enable loading fast image processors through auto * Tidy up; rescale behaviour based on input type * Enable tests for fast image processors * Smarter rescaling * Don't default to Fast * Safer imports * Add necessary Pillow requirement * Woops * Add AutoImageProcessor test * Fix up * Fix test for imagegpt * Fix test * Review comments * Add warning for TF and JAX input types * Rearrange * Return transforms * NumpyToTensor transformation * Rebase - include changes from upstream in ImageProcessingMixin * Safe typing * Fix up * convert mean/std to tesnor to rescale * Don't store transforms in state * Fix up * Update src/transformers/image_processing_utils_fast.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/auto/image_processing_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/auto/image_processing_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/auto/image_processing_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Warn if fast image processor available * Update src/transformers/models/vit/image_processing_vit_fast.py * Transpose incoming numpy images to be in CHW format * Update mapping names based on packages, auto set fast to None * Fix up * Fix * Add AutoImageProcessor.from_pretrained(checkpoint, use_fast=True) test * Update src/transformers/models/vit/image_processing_vit_fast.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * Add equivalence and speed tests * Fix up --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
2024-06-11 15:47:38 +01:00
parent edc1dffd00
commit f53fe35b29
64 changed files with 1645 additions and 813 deletions
--- a/docs/source/en/main_classes/image_processor.md
+++ b/docs/source/en/main_classes/image_processor.md
@@ -32,3 +32,8 @@ An image processor is in charge of preparing input features for vision models an
 ## BaseImageProcessor
 [[autodoc]] image_processing_utils.BaseImageProcessor
 ## BaseImageProcessorFast
 [[autodoc]] image_processing_utils_fast.BaseImageProcessorFast
--- a/docs/source/en/model_doc/vit.md
+++ b/docs/source/en/model_doc/vit.md
@@ -62,7 +62,7 @@ Following the original Vision Transformer, some follow-up works have been made:
 This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be
 found [here](https://github.com/google-research/vision_transformer).
-Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models), 
+Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models),
 who already converted the weights from JAX to PyTorch. Credits go to him!
 ## Usage tips
@@ -158,6 +158,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 [[autodoc]] ViTImageProcessor
    - preprocess
 ## ViTImageProcessorFast
 [[autodoc]] ViTImageProcessorFast
    - preprocess
 <frameworkcontent>
 <pt>
--- a/examples/pytorch/_tests_requirements.txt
+++ b/examples/pytorch/_tests_requirements.txt
@@ -29,3 +29,4 @@ timm
 albumentations >= 1.4.5
 torchmetrics
 pycocotools
 Pillow>=10.0.1,<=15.0
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1104,7 +1104,8 @@ except OptionalDependencyNotAvailable:
        name for name in dir(dummy_vision_objects) if not name.startswith("_")
    ]
 else:
-    _import_structure["image_processing_utils"] = ["ImageProcessingMixin"]
+    _import_structure["image_processing_base"] = ["ImageProcessingMixin"]
    _import_structure["image_processing_utils"] = ["BaseImageProcessor"]
    _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
    _import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"])
    _import_structure["models.bit"].extend(["BitImageProcessor"])
@@ -1167,6 +1168,18 @@ else:
    _import_structure["models.vivit"].append("VivitImageProcessor")
    _import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"])
 try:
    if not is_torchvision_available():
        raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
    from .utils import dummy_torchvision_objects
    _import_structure["utils.dummy_torchvision_objects"] = [
        name for name in dir(dummy_torchvision_objects) if not name.startswith("_")
    ]
 else:
    _import_structure["image_processing_utils_fast"] = ["BaseImageProcessorFast"]
    _import_structure["models.vit"].append("ViTImageProcessorFast")
 # PyTorch-backed objects
 try:
@@ -5703,7 +5716,8 @@ if TYPE_CHECKING:
    except OptionalDependencyNotAvailable:
        from .utils.dummy_vision_objects import *
    else:
-        from .image_processing_utils import ImageProcessingMixin
+        from .image_processing_base import ImageProcessingMixin
        from .image_processing_utils import BaseImageProcessor
        from .image_utils import ImageFeatureExtractionMixin
        from .models.beit import BeitFeatureExtractor, BeitImageProcessor
        from .models.bit import BitImageProcessor
@@ -5793,6 +5807,15 @@ if TYPE_CHECKING:
        from .models.vivit import VivitImageProcessor
        from .models.yolos import YolosFeatureExtractor, YolosImageProcessor
    try:
        if not is_torchvision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from .utils.dummy_torchvision_objects import *
    else:
        from .image_processing_utils_fast import BaseImageProcessorFast
        from .models.vit import ViTImageProcessorFast
    # Modeling
    try:
        if not is_torch_available():
--- a/src/transformers/image_processing_base.py
+++ b/src/transformers/image_processing_base.py
@@ -0,0 +1,554 @@
 # coding=utf-8
 # Copyright 2020 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import copy
 import json
 import os
 import warnings
 from io import BytesIO
 from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 import requests
 from .dynamic_module_utils import custom_object_save
 from .feature_extraction_utils import BatchFeature as BaseBatchFeature
 from .utils import (
    IMAGE_PROCESSOR_NAME,
    PushToHubMixin,
    add_model_info_to_auto_map,
    add_model_info_to_custom_pipelines,
    cached_file,
    copy_func,
    download_url,
    is_offline_mode,
    is_remote_url,
    is_vision_available,
    logging,
 )
 if is_vision_available():
    from PIL import Image
 logger = logging.get_logger(__name__)
 # TODO: Move BatchFeature to be imported by both image_processing_utils and image_processing_utils_fast
 # We override the class string here, but logic is the same.
 class BatchFeature(BaseBatchFeature):
    r"""
    Holds the output of the image processor specific `__call__` methods.

    This class is derived from a python dictionary and can be used as a dictionary. It adds no behaviour on top of
    its base class; it only provides an image-processor specific docstring.

    Args:
        data (`dict`):
            Dictionary of lists/arrays/tensors returned by the `__call__` method ('pixel_values', etc.).
        tensor_type (`Union[None, str, TensorType]`, *optional*):
            You can give a tensor_type here to convert the contained values to PyTorch/TensorFlow/Numpy tensors at
            initialization.
    """
 # TODO: (Amy) - factor out the common parts of this and the feature extractor
 class ImageProcessingMixin(PushToHubMixin):
    """
    This is an image processor mixin used to provide saving/loading functionality for sequential and image feature
    extractors.
    """
    _auto_class = None
    def __init__(self, **kwargs):
        """
        Store every remaining entry of `kwargs` as an attribute on the instance.

        Two keys are handled specially: `processor_class` is kept on the private `_processor_class` attribute, and
        the legacy `feature_extractor_type` key is discarded.

        Raises:
            AttributeError: If one of the attributes cannot be set (the failure is logged before re-raising).
        """
        # "processor_class" is stored privately instead of as a regular attribute.
        self._processor_class = kwargs.pop("processor_class", None)
        # Legacy key written when `XXXFeatureExtractor` classes handled images; it is misleading for
        # `XXXImageProcessor`, so it is dropped.
        kwargs.pop("feature_extractor_type", None)
        # Everything else becomes a plain attribute.
        for key in kwargs:
            value = kwargs[key]
            try:
                setattr(self, key, value)
            except AttributeError as err:
                logger.error(f"Can't set {key} with value {value} for {self}")
                raise err
    def _set_processor_class(self, processor_class: str) -> None:
        """
        Record the processor class associated with this image processor.

        Args:
            processor_class (`str`):
                Value stored on the private `_processor_class` attribute.
        """
        self._processor_class = processor_class
    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        **kwargs,
    ):
        r"""
        Instantiate a type of [`~image_processing_utils.ImageProcessingMixin`] from an image processor.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained image_processor hosted inside a model repo on
                  huggingface.co.
                - a path to a *directory* containing an image processor file saved using the
                  [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
                  `./my_model_directory/`.
                - a path or url to a saved image processor JSON *file*, e.g.,
                  `./my_model_directory/preprocessor_config.json`.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model image processor should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force to (re-)download the image processor files and override the cached versions if
                they exist.
            local_files_only (`bool`, *optional*, defaults to `False`):
                Forwarded unchanged to `get_image_processor_dict`, which passes it on to the file resolution step
                (presumably restricting loading to files already available locally; confirm against `cached_file`).
            resume_download:
                Deprecated and ignored. All downloads are now resumed by default when possible.
                Will be removed in v5 of Transformers.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.

                <Tip>

                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

                </Tip>

            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
                If `False`, then this function returns just the final image processor object. If `True`, then this
                functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
                consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of
                `kwargs` which has not been used to update `image_processor` and is otherwise ignored.
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.
            kwargs (`Dict[str, Any]`, *optional*):
                The values in kwargs of any keys which are image processor attributes will be used to override the
                loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is
                controlled by the `return_unused_kwargs` keyword parameter.

        Returns:
            An image processor of type [`~image_processing_utils.ImageProcessingMixin`].

        Raises:
            ValueError: If both `token` and the deprecated `use_auth_token` are provided.

        Examples:

        ```python
        # We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a
        # derived class: *CLIPImageProcessor*
        image_processor = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-base-patch32"
        )  # Download image_processing_config from huggingface.co and cache.
        image_processor = CLIPImageProcessor.from_pretrained(
            "./test/saved_model/"
        )  # E.g. image processor (or model) was saved using *save_pretrained('./test/saved_model/')*
        image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json")
        image_processor = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-base-patch32", do_normalize=False, foo=False
        )
        assert image_processor.do_normalize is False
        image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True
        )
        assert image_processor.do_normalize is False
        assert unused_kwargs == {"foo": False}
        ```"""
        # Fold the explicit loading options back into `kwargs` so they travel together to
        # `get_image_processor_dict`, which pops them again.
        kwargs["cache_dir"] = cache_dir
        kwargs["force_download"] = force_download
        kwargs["local_files_only"] = local_files_only
        kwargs["revision"] = revision
        # Backward compatibility: `use_auth_token` is the deprecated spelling of `token`.
        use_auth_token = kwargs.pop("use_auth_token", None)
        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if token is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            token = use_auth_token
        # Only forward `token` when one was actually given.
        if token is not None:
            kwargs["token"] = token
        # Resolve the JSON configuration first, then build the instance; kwargs not consumed by the loading step
        # are handed to `from_dict` as attribute overrides.
        image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(image_processor_dict, **kwargs)
    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
        """
        Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the
        [`~image_processing_utils.ImageProcessingMixin.from_pretrained`] class method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the image processor JSON file will be saved (will be created if it does not exist).
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs (`Dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

        Returns:
            `List[str]`: A single-element list holding the path of the written image processor JSON file.

        Raises:
            ValueError: If both `token` and the deprecated `use_auth_token` are provided.
            AssertionError: If `save_directory` points to an existing file instead of a directory.
        """
        # Backward compatibility: `use_auth_token` is the deprecated spelling of `token`.
        use_auth_token = kwargs.pop("use_auth_token", None)
        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if kwargs.get("token", None) is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            kwargs["token"] = use_auth_token
        if os.path.isfile(save_directory):
            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
        os.makedirs(save_directory, exist_ok=True)
        if push_to_hub:
            commit_message = kwargs.pop("commit_message", None)
            # `save_directory` may be an `os.PathLike` (e.g. `pathlib.Path`), which has no `.split` method, so it
            # is converted with `os.fspath` before deriving the default repository name from its last component.
            default_repo_id = os.fspath(save_directory).split(os.path.sep)[-1]
            repo_id = kwargs.pop("repo_id", default_repo_id)
            repo_id = self._create_repo(repo_id, **kwargs)
            # Snapshot taken before writing, so that the upload step can tell which files were modified.
            files_timestamps = self._get_files_timestamps(save_directory)
        # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
        # loaded from the Hub.
        if self._auto_class is not None:
            custom_object_save(self, save_directory, config=self)
        # If we save using the predefined names, we can load using `from_pretrained`
        output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME)
        self.to_json_file(output_image_processor_file)
        logger.info(f"Image processor saved in {output_image_processor_file}")
        if push_to_hub:
            self._upload_modified_files(
                save_directory,
                repo_id,
                files_timestamps,
                commit_message=commit_message,
                token=kwargs.get("token"),
            )
        return [output_image_processor_file]
    @classmethod
    def get_image_processor_dict(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """
        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating an
        image processor of type [`~image_processing_utils.ImageProcessingMixin`] using `from_dict`.

        Parameters:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.

        Returns:
            `Tuple[Dict, Dict]`: The parsed image processor dictionary, followed by the entries of `kwargs` that were
            not consumed as loading options.

        Raises:
            ValueError: If both `token` and the deprecated `use_auth_token` are provided.
            EnvironmentError: If the image processor file cannot be resolved, or if it is not valid JSON.
        """
        # Pull the loading options out of `kwargs`; whatever remains is returned to the caller untouched.
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        resume_download = kwargs.pop("resume_download", None)
        proxies = kwargs.pop("proxies", None)
        token = kwargs.pop("token", None)
        use_auth_token = kwargs.pop("use_auth_token", None)
        local_files_only = kwargs.pop("local_files_only", False)
        revision = kwargs.pop("revision", None)
        subfolder = kwargs.pop("subfolder", "")
        from_pipeline = kwargs.pop("_from_pipeline", None)
        from_auto_class = kwargs.pop("_from_auto", False)
        # Backward compatibility: `use_auth_token` is the deprecated spelling of `token`.
        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if token is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            token = use_auth_token
        # Metadata handed to `cached_file` through its `user_agent` argument.
        user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class}
        if from_pipeline is not None:
            user_agent["using_pipeline"] = from_pipeline
        if is_offline_mode() and not local_files_only:
            logger.info("Offline mode: forcing local_files_only=True")
            local_files_only = True
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        # `is_local` selects the log message below and whether `auto_map` / `custom_pipelines` get rewritten.
        is_local = os.path.isdir(pretrained_model_name_or_path)
        if os.path.isdir(pretrained_model_name_or_path):
            # NOTE(review): when execution reaches the final `else` branch below, this value is replaced by
            # `IMAGE_PROCESSOR_NAME` before it is used.
            image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME)
        if os.path.isfile(pretrained_model_name_or_path):
            # Direct path to a JSON file. `image_processor_file` is not assigned on this path; it is only read in
            # the non-local logging branch further down.
            resolved_image_processor_file = pretrained_model_name_or_path
            is_local = True
        elif is_remote_url(pretrained_model_name_or_path):
            image_processor_file = pretrained_model_name_or_path
            resolved_image_processor_file = download_url(pretrained_model_name_or_path)
        else:
            image_processor_file = IMAGE_PROCESSOR_NAME
            try:
                # Load from local folder or from cache or download from model Hub and cache
                resolved_image_processor_file = cached_file(
                    pretrained_model_name_or_path,
                    image_processor_file,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
                    token=token,
                    user_agent=user_agent,
                    revision=revision,
                    subfolder=subfolder,
                )
            except EnvironmentError:
                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
                # the original exception.
                raise
            except Exception:
                # For any other exception, we throw a generic error.
                raise EnvironmentError(
                    f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
                    " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
                    f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
                    f" directory containing a {IMAGE_PROCESSOR_NAME} file"
                )
        try:
            # Load image_processor dict
            with open(resolved_image_processor_file, "r", encoding="utf-8") as reader:
                text = reader.read()
            image_processor_dict = json.loads(text)
        except json.JSONDecodeError:
            raise EnvironmentError(
                f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
            )
        if is_local:
            logger.info(f"loading configuration file {resolved_image_processor_file}")
        else:
            logger.info(
                f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}"
            )
        # For non-local sources, the `auto_map` / `custom_pipelines` entries are rewritten together with the
        # identifier they were loaded from.
        if not is_local:
            if "auto_map" in image_processor_dict:
                image_processor_dict["auto_map"] = add_model_info_to_auto_map(
                    image_processor_dict["auto_map"], pretrained_model_name_or_path
                )
            if "custom_pipelines" in image_processor_dict:
                image_processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines(
                    image_processor_dict["custom_pipelines"], pretrained_model_name_or_path
                )
        return image_processor_dict, kwargs
    @classmethod
    def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
        """
        Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.

        Args:
            image_processor_dict (`Dict[str, Any]`):
                Dictionary that will be used to instantiate the image processor object. Such a dictionary can be
                retrieved from a pretrained checkpoint by leveraging the
                [`~image_processing_utils.ImageProcessingMixin.to_dict`] method.
            kwargs (`Dict[str, Any]`):
                Additional parameters from which to initialize the image processor object. The special key
                `return_unused_kwargs` (defaults to `False`) controls the return value.

        Returns:
            [`~image_processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those
            parameters. When `return_unused_kwargs` is `True`, a tuple of the image processor and the dictionary of
            kwargs that did not match any of its attributes.
        """
        # Work on a shallow copy so the caller's dictionary is left untouched.
        init_params = image_processor_dict.copy()
        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
        # `size` / `crop_size` used to be an int or tuple in feature extractors and are dicts now. Overrides are
        # injected into the init parameters so the constructor converts them to the appropriate dict, instead of
        # being set as raw attributes afterwards.
        for dim_key in ("size", "crop_size"):
            if dim_key in kwargs and dim_key in init_params:
                init_params[dim_key] = kwargs.pop(dim_key)
        image_processor = cls(**init_params)
        # Remaining kwargs that name an existing attribute override it and are consumed; the rest stay in `kwargs`.
        for attr_name in list(kwargs):
            if hasattr(image_processor, attr_name):
                setattr(image_processor, attr_name, kwargs.pop(attr_name))
        logger.info(f"Image processor {image_processor}")
        if return_unused_kwargs:
            return image_processor, kwargs
        return image_processor
    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes this instance to a Python dictionary.

        Returns:
            `Dict[str, Any]`: A deep copy of the instance attributes, with the class name added under the
            `image_processor_type` key.
        """
        # Deep copy so that callers can freely mutate the result without touching the instance.
        serialized = copy.deepcopy(vars(self))
        serialized["image_processor_type"] = self.__class__.__name__
        return serialized
    @classmethod
    def from_json_file(cls, json_file: Union[str, os.PathLike]):
        """
        Instantiates an image processor of type [`~image_processing_utils.ImageProcessingMixin`] from the path to a
        JSON file of parameters.

        Args:
            json_file (`str` or `os.PathLike`):
                Path to the JSON file containing the parameters.

        Returns:
            An image processor of type [`~image_processing_utils.ImageProcessingMixin`]: The image_processor object
            instantiated from that JSON file.
        """
        with open(json_file, "r", encoding="utf-8") as reader:
            init_params = json.load(reader)
        # The parsed entries are passed straight to the constructor as keyword arguments.
        return cls(**init_params)
    def to_json_string(self) -> str:
        """
        Serializes this instance to a JSON string.

        Returns:
            `str`: String containing all the attributes that make up this image processor instance in JSON format
            (2-space indentation, keys sorted, terminated by a newline).
        """
        # NumPy arrays are not JSON serializable, so top-level array values are converted to (nested) lists.
        payload = {
            name: (item.tolist() if isinstance(item, np.ndarray) else item)
            for name, item in self.to_dict().items()
        }
        # The private `_processor_class` attribute is exported under the public key `processor_class`,
        # and omitted entirely when it is `None`.
        processor_class = payload.pop("_processor_class", None)
        if processor_class is not None:
            payload["processor_class"] = processor_class
        return json.dumps(payload, indent=2, sort_keys=True) + "\n"
    def to_json_file(self, json_file_path: Union[str, os.PathLike]) -> None:
        """
        Save this instance to a JSON file.

        The file is opened in write mode with UTF-8 encoding and receives the output of `to_json_string`.

        Args:
            json_file_path (`str` or `os.PathLike`):
                Path to the JSON file in which this image_processor instance's parameters will be saved.
        """
        with open(json_file_path, "w", encoding="utf-8") as writer:
            writer.write(self.to_json_string())
    def __repr__(self) -> str:
        """Return the class name followed by the JSON serialization produced by `to_json_string`."""
        return f"{self.__class__.__name__} {self.to_json_string()}"
    @classmethod
    def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
        """
        Register this class with a given auto class. This should only be used for custom image processors as the ones
        in the library are already mapped with `AutoImageProcessor`.

        <Tip warning={true}>

        This API is experimental and may have some slight breaking changes in the next releases.

        </Tip>

        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor"`):
                The auto class to register this new image processor with.

        Raises:
            ValueError: If `transformers.models.auto` has no attribute with the resolved auto class name.
        """
        # Accept either the auto class itself or its name; only the name is stored.
        auto_class = auto_class if isinstance(auto_class, str) else auto_class.__name__

        # Function-scope import; presumably avoids a circular import at module load time (confirm).
        import transformers.models.auto as auto_module

        if hasattr(auto_module, auto_class):
            cls._auto_class = auto_class
            return
        raise ValueError(f"{auto_class} is not a valid auto class.")
    def fetch_images(self, image_url_or_urls: Union[str, List[str]]):
        """
        Convert a single or a list of urls into the corresponding `PIL.Image` objects.

        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
        returned (each element is processed by a recursive call).

        Raises:
            ValueError: If the argument is neither a `str` nor a `list`.
        """
        # Lists are handled element by element through recursion.
        if isinstance(image_url_or_urls, list):
            return [self.fetch_images(url) for url in image_url_or_urls]
        if not isinstance(image_url_or_urls, str):
            raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
        # The request is sent with a desktop-browser `User-Agent` header.
        browser_user_agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0"
            " Safari/537.36"
        )
        response = requests.get(image_url_or_urls, stream=True, headers={"User-Agent": browser_user_agent})
        # Surface HTTP error statuses as exceptions instead of trying to decode an error page as an image.
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
 # Give `ImageProcessingMixin` its own `push_to_hub` function object before editing its docstring.
 # NOTE(review): presumably `copy_func` returns an independent copy, so the formatted docstring does not leak to
 # other users of `PushToHubMixin.push_to_hub`. Confirm against `utils.copy_func`.
 ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
 # `__doc__` can be `None` (e.g. when docstrings are stripped with `python -OO`); there is nothing to format then.
 if ImageProcessingMixin.push_to_hub.__doc__ is not None:
    # Fill the `{object}`, `{object_class}` and `{object_files}` placeholders of the docstring template.
    ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
        object="image processor", object_class="AutoImageProcessor", object_files="image processor file"
    )
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -13,38 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import copy
+from typing import Dict, Iterable, Optional, Union
 import json
 import os
 import warnings
 from io import BytesIO
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 import numpy as np
 import requests
-from .dynamic_module_utils import custom_object_save
+from .image_processing_base import BatchFeature, ImageProcessingMixin
 from .feature_extraction_utils import BatchFeature as BaseBatchFeature
 from .image_transforms import center_crop, normalize, rescale
 from .image_utils import ChannelDimension
-from .utils import (
+from .utils import logging
    IMAGE_PROCESSOR_NAME,
    PushToHubMixin,
    add_model_info_to_auto_map,
    add_model_info_to_custom_pipelines,
    cached_file,
    copy_func,
    download_url,
    is_offline_mode,
    is_remote_url,
    is_vision_available,
    logging,
 )
 if is_vision_available():
    from PIL import Image
 logger = logging.get_logger(__name__)
@@ -54,505 +32,6 @@ INIT_SERVICE_KWARGS = [
 ]
 # TODO: Move BatchFeature to be imported by both feature_extraction_utils and image_processing_utils
 # We override the class string here, but logic is the same.
 class BatchFeature(BaseBatchFeature):
    r"""
    Holds the output of the image processor specific `__call__` methods.
    This class is derived from a python dictionary and can be used as a dictionary. It adds no attributes or
    methods of its own: only the docstring differs from the parent class.
    Args:
        data (`dict`):
            Dictionary of lists/arrays/tensors returned by the `__call__` method ('pixel_values', etc.).
        tensor_type (`Union[None, str, TensorType]`, *optional*):
            You can give a tensor_type here to convert the lists of integers to PyTorch/TensorFlow/Numpy tensors at
            initialization.
    """
 # TODO: (Amy) - factor out the common parts of this and the feature extractor
 class ImageProcessingMixin(PushToHubMixin):
    """
    This is an image processor mixin used to provide saving/loading functionality for sequential and image feature
    extractors.
    """
    _auto_class = None
    def __init__(self, **kwargs):
        """Store every remaining entry of `kwargs` as an attribute of the instance.

        Raises:
            AttributeError: re-raised (after being logged) when an entry cannot be set as an attribute.
        """
        # Legacy key written while image processing still went through `XXXFeatureExtractor` classes; its value
        # is misleading now that `XXXImageProcessor` is used, so it is dropped.
        kwargs.pop("feature_extractor_type", None)
        # "processor_class" is kept as a private attribute instead of a regular one.
        self._processor_class = kwargs.pop("processor_class", None)
        # Everything left over has no default value and simply becomes an attribute.
        for attr_name in kwargs:
            attr_value = kwargs[attr_name]
            try:
                setattr(self, attr_name, attr_value)
            except AttributeError as err:
                logger.error(f"Can't set {attr_name} with value {attr_value} for {self}")
                raise err
    def _set_processor_class(self, processor_class: str):
        """Record `processor_class` on the instance under the private `_processor_class` attribute."""
        setattr(self, "_processor_class", processor_class)
    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        **kwargs,
    ):
        r"""
        Load an image processor of type [`~image_processing_utils.ImageProcessingMixin`] from a saved configuration.

        The configuration dictionary is resolved with `get_image_processor_dict` and the instance is then built
        with `from_dict`.
        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                One of:
                - the *model id* of a pretrained image_processor hosted inside a model repo on huggingface.co.
                - a path to a *directory* containing an image processor file saved using the
                  [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
                  `./my_model_directory/`.
                - a path or url to a saved image processor JSON *file*, e.g.,
                  `./my_model_directory/preprocessor_config.json`.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Directory in which a downloaded image processor should be cached when the standard cache should
                not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to (re-)download the image processor files, overriding cached versions if they
                exist.
            resume_download:
                Deprecated and ignored. All downloads are now resumed by default when possible.
                Will be removed in v5 of Transformers.
            proxies (`Dict[str, str]`, *optional*):
                Proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will
                use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use: a branch name, a tag name, or a commit id (any identifier
                allowed by git, since models are stored in a git-based system on huggingface.co).
                <Tip>
                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.
                </Tip>
            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
                If `False`, only the image processor is returned. If `True`, a
                `Tuple(image_processor, unused_kwargs)` is returned, where *unused_kwargs* holds the key/value
                pairs of `kwargs` whose keys are not image processor attributes.
            subfolder (`str`, *optional*, defaults to `""`):
                Name of the subfolder of the model repo on huggingface.co in which the relevant files are located.
            kwargs (`Dict[str, Any]`, *optional*):
                Values whose keys are image processor attributes override the loaded values. What happens to the
                other key/value pairs is controlled by `return_unused_kwargs`.
        Returns:
            A image processor of type [`~image_processing_utils.ImageProcessingMixin`].
        Raises:
            ValueError: if both `token` and the deprecated `use_auth_token` are given.
        Examples:
        ```python
        # The base class *ImageProcessingMixin* can't be instantiated directly, so the examples use a derived
        # class: *CLIPImageProcessor*
        image_processor = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-base-patch32"
        )  # Download image_processing_config from huggingface.co and cache.
        image_processor = CLIPImageProcessor.from_pretrained(
            "./test/saved_model/"
        )  # E.g. image processor (or model) was saved using *save_pretrained('./test/saved_model/')*
        image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json")
        image_processor = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-base-patch32", do_normalize=False, foo=False
        )
        assert image_processor.do_normalize is False
        image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True
        )
        assert image_processor.do_normalize is False
        assert unused_kwargs == {"foo": False}
        ```"""
        # Forward the explicitly named loading options together with the free-form kwargs.
        kwargs.update(
            cache_dir=cache_dir,
            force_download=force_download,
            local_files_only=local_files_only,
            revision=revision,
        )
        # `use_auth_token` is the deprecated spelling of `token`; accept it, but never both at once.
        legacy_token = kwargs.pop("use_auth_token", None)
        if legacy_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if token is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            token = legacy_token
        # Only forward a token that was actually provided.
        if token is not None:
            kwargs["token"] = token
        image_processor_dict, remaining_kwargs = cls.get_image_processor_dict(
            pretrained_model_name_or_path, **kwargs
        )
        return cls.from_dict(image_processor_dict, **remaining_kwargs)
    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
        """
        Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the
        [`~image_processing_utils.ImageProcessingMixin.from_pretrained`] class method.
        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the image processor JSON file will be saved (will be created if it does not exist).
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs (`Dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        Returns:
            `List[str]`: A single-element list holding the path of the JSON file that was written.
        Raises:
            ValueError: If both `token` and the deprecated `use_auth_token` are given.
            AssertionError: If `save_directory` points to an existing file.
        """
        use_auth_token = kwargs.pop("use_auth_token", None)
        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if kwargs.get("token", None) is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            kwargs["token"] = use_auth_token
        if os.path.isfile(save_directory):
            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
        os.makedirs(save_directory, exist_ok=True)
        if push_to_hub:
            commit_message = kwargs.pop("commit_message", None)
            # `save_directory` may be an `os.PathLike` (e.g. `pathlib.Path`), which has no `.split`; convert it
            # with `os.fspath` before taking its last path component as the default repo name.
            repo_id = kwargs.pop("repo_id", os.fspath(save_directory).split(os.path.sep)[-1])
            repo_id = self._create_repo(repo_id, **kwargs)
            # Captured before writing so that `_upload_modified_files` can be handed the pre-save state.
            files_timestamps = self._get_files_timestamps(save_directory)
        # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
        # loaded from the Hub.
        if self._auto_class is not None:
            custom_object_save(self, save_directory, config=self)
        # If we save using the predefined names, we can load using `from_pretrained`
        output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME)
        self.to_json_file(output_image_processor_file)
        logger.info(f"Image processor saved in {output_image_processor_file}")
        if push_to_hub:
            self._upload_modified_files(
                save_directory,
                repo_id,
                files_timestamps,
                commit_message=commit_message,
                token=kwargs.get("token"),
            )
        return [output_image_processor_file]
    @classmethod
    def get_image_processor_dict(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """
        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
        image processor of type [`~image_processor_utils.ImageProcessingMixin`] using `from_dict`.
        Parameters:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.
            kwargs:
                The loading options `cache_dir`, `force_download`, `resume_download`, `proxies`, `token`,
                `use_auth_token`, `local_files_only`, `revision`, `subfolder`, `_from_pipeline` and `_from_auto` are
                popped from `kwargs`; whatever is left is returned untouched.
        Returns:
            `Tuple[Dict, Dict]`: The parsed image processor dictionary and the remaining, unconsumed `kwargs`.
        Raises:
            ValueError: If both `token` and the deprecated `use_auth_token` are given.
            EnvironmentError: If the file cannot be resolved or is not valid JSON.
        """
        # Pop every loading option so that only unrelated kwargs are handed back to the caller.
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        resume_download = kwargs.pop("resume_download", None)
        proxies = kwargs.pop("proxies", None)
        token = kwargs.pop("token", None)
        use_auth_token = kwargs.pop("use_auth_token", None)
        local_files_only = kwargs.pop("local_files_only", False)
        revision = kwargs.pop("revision", None)
        subfolder = kwargs.pop("subfolder", "")
        from_pipeline = kwargs.pop("_from_pipeline", None)
        from_auto_class = kwargs.pop("_from_auto", False)
        # `use_auth_token` is the deprecated spelling of `token`; accept it, but never both at once.
        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if token is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            token = use_auth_token
        # Metadata passed to `cached_file` as `user_agent`.
        user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class}
        if from_pipeline is not None:
            user_agent["using_pipeline"] = from_pipeline
        if is_offline_mode() and not local_files_only:
            logger.info("Offline mode: forcing local_files_only=True")
            local_files_only = True
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        # `is_local` is True for an existing directory or (set below) an existing file; it selects the log
        # message and whether `auto_map` / `custom_pipelines` get rewritten at the end.
        is_local = os.path.isdir(pretrained_model_name_or_path)
        # NOTE(review): for a directory this value looks like it is overwritten by the `else` branch below
        # (a directory is not a file) — confirm whether this assignment is still needed.
        if os.path.isdir(pretrained_model_name_or_path):
            image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME)
        if os.path.isfile(pretrained_model_name_or_path):
            # Direct path to a file: use it as is. `image_processor_file` stays unset here, which is fine
            # because it is only read in the non-local logging branch.
            resolved_image_processor_file = pretrained_model_name_or_path
            is_local = True
        elif is_remote_url(pretrained_model_name_or_path):
            image_processor_file = pretrained_model_name_or_path
            resolved_image_processor_file = download_url(pretrained_model_name_or_path)
        else:
            image_processor_file = IMAGE_PROCESSOR_NAME
            try:
                # Load from local folder or from cache or download from model Hub and cache
                resolved_image_processor_file = cached_file(
                    pretrained_model_name_or_path,
                    image_processor_file,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
                    token=token,
                    user_agent=user_agent,
                    revision=revision,
                    subfolder=subfolder,
                )
            except EnvironmentError:
                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
                # the original exception.
                raise
            except Exception:
                # For any other exception, we throw a generic error.
                raise EnvironmentError(
                    f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
                    " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
                    f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
                    f" directory containing a {IMAGE_PROCESSOR_NAME} file"
                )
        try:
            # Load image_processor dict
            with open(resolved_image_processor_file, "r", encoding="utf-8") as reader:
                text = reader.read()
            image_processor_dict = json.loads(text)
        except json.JSONDecodeError:
            raise EnvironmentError(
                f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
            )
        if is_local:
            logger.info(f"loading configuration file {resolved_image_processor_file}")
        else:
            logger.info(
                f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}"
            )
        # For non-local sources, pass the `auto_map` / `custom_pipelines` entries through the helpers together
        # with the model name or path.
        # NOTE(review): presumably this lets the referenced custom code be located in the originating repo —
        # confirm against the helpers.
        if not is_local:
            if "auto_map" in image_processor_dict:
                image_processor_dict["auto_map"] = add_model_info_to_auto_map(
                    image_processor_dict["auto_map"], pretrained_model_name_or_path
                )
            if "custom_pipelines" in image_processor_dict:
                image_processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines(
                    image_processor_dict["custom_pipelines"], pretrained_model_name_or_path
                )
        return image_processor_dict, kwargs
    @classmethod
    def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
        """
        Build a [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.
        Args:
            image_processor_dict (`Dict[str, Any]`):
                Parameters used to instantiate the image processor. Such a dictionary can be retrieved from a
                pretrained checkpoint by leveraging the
                [`~image_processing_utils.ImageProcessingMixin.to_dict`] method. It is copied, not modified.
            kwargs (`Dict[str, Any]`):
                Additional parameters from which to initialize the image processor object. The special key
                `return_unused_kwargs` (default `False`) controls the return value.
        Returns:
            [`~image_processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those
            parameters, or a tuple of that object and the unused kwargs when `return_unused_kwargs` is truthy.
        """
        init_params = image_processor_dict.copy()
        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
        # `size` (and `crop_size`) is a dict now but used to be an int or tuple in feature extractors. An override
        # is routed through the constructor arguments so that the image processor converts it to the appropriate
        # dict, rather than being overwritten as a raw attribute afterwards.
        for dict_key in ("size", "crop_size"):
            if dict_key in kwargs and dict_key in init_params:
                init_params[dict_key] = kwargs.pop(dict_key)
        image_processor = cls(**init_params)
        # Any remaining kwarg naming an existing attribute overrides it and counts as used.
        for name, value in list(kwargs.items()):
            if hasattr(image_processor, name):
                setattr(image_processor, name, value)
                del kwargs[name]
        logger.info(f"Image processor {image_processor}")
        if not return_unused_kwargs:
            return image_processor
        return image_processor, kwargs
    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this instance to a Python dictionary.
        Returns:
            `Dict[str, Any]`: A deep copy of the instance attributes, plus an `image_processor_type` entry holding
            the class name.
        """
        state = copy.deepcopy(self.__dict__)
        state.update(image_processor_type=self.__class__.__name__)
        return state
    @classmethod
    def from_json_file(cls, json_file: Union[str, os.PathLike]):
        """
        Build an image processor of type [`~image_processing_utils.ImageProcessingMixin`] from a JSON file of
        parameters.
        Args:
            json_file (`str` or `os.PathLike`):
                Path to the JSON file containing the parameters.
        Returns:
            A image processor of type [`~image_processing_utils.ImageProcessingMixin`]: The image_processor object
            instantiated from that JSON file.
        """
        with open(json_file, "r", encoding="utf-8") as reader:
            image_processor_dict = json.load(reader)
        # The parsed entries are passed straight to the constructor as keyword arguments.
        return cls(**image_processor_dict)
    def to_json_string(self) -> str:
        """
        Serialize this instance to a JSON string.
        Returns:
            `str`: The attributes of this image processor as indented, key-sorted JSON followed by a newline.
        """
        dictionary = self.to_dict()
        # numpy arrays are not JSON serializable: swap each one for its nested-list form.
        array_keys = [key for key, value in dictionary.items() if isinstance(value, np.ndarray)]
        for key in array_keys:
            dictionary[key] = dictionary[key].tolist()
        # The private "_processor_class" attribute is written out under the public name "processor_class",
        # and left out entirely when it is None.
        processor_class = dictionary.pop("_processor_class", None)
        if processor_class is not None:
            dictionary["processor_class"] = processor_class
        serialized = json.dumps(dictionary, indent=2, sort_keys=True)
        return serialized + "\n"
    def to_json_file(self, json_file_path: Union[str, os.PathLike]):
        """
        Save this instance to a JSON file.
        Args:
            json_file_path (`str` or `os.PathLike`):
                Path to the JSON file in which this image_processor instance's parameters will be saved.
        """
        # Serialize before opening the file: opening in "w" mode truncates it, so a serialization failure
        # (e.g. an attribute that is not JSON serializable) would otherwise leave an empty file behind and
        # clobber any previous content.
        json_string = self.to_json_string()
        with open(json_file_path, "w", encoding="utf-8") as writer:
            writer.write(json_string)
    def __repr__(self):
        # Class name followed by the JSON form of the instance.
        return "{} {}".format(self.__class__.__name__, self.to_json_string())
    @classmethod
    def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
        """
        Register this class with a given auto class. This should only be used for custom image processors as the ones
        in the library are already mapped with `AutoImageProcessor`.
        <Tip warning={true}>
        This API is experimental and may have some slight breaking changes in the next releases.
        </Tip>
        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor"`):
                The auto class to register this new image processor with.
        Raises:
            ValueError: If `transformers.models.auto` has no attribute with the given auto class name.
        """
        # Accept either the class itself or its name.
        auto_class_name = auto_class if isinstance(auto_class, str) else auto_class.__name__
        import transformers.models.auto as auto_module

        if hasattr(auto_module, auto_class_name):
            cls._auto_class = auto_class_name
            return
        raise ValueError(f"{auto_class_name} is not a valid auto class.")
    def fetch_images(self, image_url_or_urls: Union[str, List[str]]):
        """
        Download the image(s) behind one url or a list of urls and open them as `PIL.Image` objects.

        A single url yields a single image; a list yields a list of results, each entry being handled by a
        recursive call.
        Raises:
            ValueError: If the argument is neither a `str` nor a `list`.
        """
        if isinstance(image_url_or_urls, list):
            return [self.fetch_images(url) for url in image_url_or_urls]
        if not isinstance(image_url_or_urls, str):
            raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
        # The request is sent with a browser-like User-Agent header.
        browser_headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0"
                " Safari/537.36"
            )
        }
        response = requests.get(image_url_or_urls, stream=True, headers=browser_headers)
        # Surface HTTP errors instead of trying to decode an error page as an image.
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
 class BaseImageProcessor(ImageProcessingMixin):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
@@ -801,10 +280,3 @@ def select_best_resolution(original_size: tuple, possible_resolutions: list) ->
            best_fit = (height, width)
    return best_fit
 # Rebind `push_to_hub` on ImageProcessingMixin to the result of `copy_func`, then fill the `object`,
 # `object_class` and `object_files` format fields of its docstring with image-processor wording.
 # NOTE(review): presumably the copy exists so the docstring of the original `push_to_hub` is left untouched —
 # confirm against `copy_func`.
 ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
 # Nothing to format when the function carries no docstring.
 if ImageProcessingMixin.push_to_hub.__doc__ is not None:
    ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
        object="image processor", object_class="AutoImageProcessor", object_files="image processor file"
    )
--- a/src/transformers/image_processing_utils_fast.py
+++ b/src/transformers/image_processing_utils_fast.py
@@ -0,0 +1,63 @@
 # coding=utf-8
 # Copyright 2024 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import functools
 from dataclasses import dataclass
 from .image_processing_utils import BaseImageProcessor
 from .utils.import_utils import is_torchvision_available
 if is_torchvision_available():
    from torchvision.transforms import Compose
@dataclass(frozen=True)
class SizeDict:
    """
    Hashable dictionary to store image size information.

    Every field defaults to `None`. Fields can be read as attributes or with `size["height"]`-style
    indexing; the instance is frozen, so it cannot be modified after creation.
    """

    height: int = None
    width: int = None
    longest_edge: int = None
    shortest_edge: int = None
    max_height: int = None
    max_width: int = None

    def __getitem__(self, key):
        """
        Return the value of the size field named `key`.

        Raises:
            KeyError: If `key` is not one of the declared size fields.
        """
        # Only the declared dataclass fields are valid keys. A plain `hasattr` check would also expose
        # methods and dunder attributes (e.g. "__class__") and raise TypeError for non-string keys.
        if isinstance(key, str) and key in self.__dataclass_fields__:
            return getattr(self, key)
        raise KeyError(f"Key {key} not found in SizeDict.")
 class BaseImageProcessorFast(BaseImageProcessor):
    """
    Base class for image processors whose preprocessing is expressed as a composed transform pipeline.

    Subclasses implement `_build_transforms`; `_validate_params` checks keyword names against
    `_transform_params`.
    """

    # Names of the keyword arguments accepted by `get_transforms`. It is `None` here.
    # NOTE(review): subclasses presumably override this with a collection of names — confirm.
    _transform_params = None

    def _build_transforms(self, **kwargs) -> "Compose":
        """
        Build the image transforms for the given settings (e.g. `do_resize`). Not implemented in the base class.
        """
        raise NotImplementedError

    def _validate_params(self, **kwargs) -> None:
        """Raise a `ValueError` for the first keyword whose name is not listed in `_transform_params`."""
        unknown = next(((name, value) for name, value in kwargs.items() if name not in self._transform_params), None)
        if unknown is not None:
            k, v = unknown
            raise ValueError(f"Invalid transform parameter {k}={v}.")

    # The cache keys on `self` and the keyword arguments, so every value passed in must be hashable; with
    # `maxsize=1` only the most recent call is remembered.
    @functools.lru_cache(maxsize=1)
    def get_transforms(self, **kwargs) -> "Compose":
        """Validate the keyword names, then return the transforms built by `_build_transforms`."""
        self._validate_params(**kwargs)
        transforms = self._build_transforms(**kwargs)
        return transforms
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@@ -31,6 +31,7 @@ from .utils.import_utils import (
    is_flax_available,
    is_tf_available,
    is_torch_available,
    is_torchvision_available,
    is_vision_available,
    requires_backends,
 )
@@ -50,6 +51,9 @@ if is_tf_available():
 if is_flax_available():
    import jax.numpy as jnp
 if is_torchvision_available():
    from torchvision.transforms import functional as F
 def to_channel_dimension_format(
    image: np.ndarray,
@@ -374,6 +378,7 @@ def normalize(
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)
    channel_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
    num_channels = image.shape[channel_axis]
@@ -802,3 +807,48 @@ def flip_channel_order(
    if data_format is not None:
        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
    return image
 def _cast_tensor_to_float(x):
    if x.is_floating_point():
        return x
    return x.float()
 class FusedRescaleNormalize:
    """
    Rescale and normalize the input image in a single normalization step.

    Rather than multiplying the image by `rescale_factor` first, the mean and std are divided by it once at
    construction, which gives the same result as rescaling followed by normalization.
    """

    def __init__(self, mean, std, rescale_factor: float = 1.0, inplace: bool = False):
        inverse_factor = 1.0 / rescale_factor
        self.mean = torch.tensor(mean) * inverse_factor
        self.std = torch.tensor(std) * inverse_factor
        self.inplace = inplace

    def __call__(self, image: "torch.Tensor"):
        # The image is cast to a floating point dtype first when it is not one already.
        float_image = _cast_tensor_to_float(image)
        return F.normalize(float_image, self.mean, self.std, inplace=self.inplace)
 class Rescale:
    """
    Multiply the input image by a fixed rescale factor and return the product (the input is not modified).
    """

    def __init__(self, rescale_factor: float = 1.0):
        self.rescale_factor = rescale_factor

    def __call__(self, image: "torch.Tensor"):
        return image * self.rescale_factor
 class NumpyToTensor:
    """
    Turn a numpy image into a contiguous PyTorch tensor, moving the last axis to the front.
    """

    def __call__(self, image: np.ndarray):
        # Like PyTorch, incoming numpy images are assumed to be in HWC format and are converted to CHW
        # c.f. https://github.com/pytorch/vision/blob/61d97f41bc209e1407dcfbd685d2ee2da9c1cdad/torchvision/transforms/functional.py#L154
        channels_first = image.transpose(2, 0, 1)
        tensor = torch.from_numpy(channels_first)
        return tensor.contiguous()
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -25,9 +25,11 @@ from packaging import version
 from .utils import (
    ExplicitEnum,
    is_jax_tensor,
    is_numpy_array,
    is_tf_tensor,
    is_torch_available,
    is_torch_tensor,
    is_torchvision_available,
    is_vision_available,
    logging,
    requires_backends,
@@ -52,6 +54,20 @@ if is_vision_available():
    else:
        PILImageResampling = PIL.Image
    if is_torchvision_available():
        from torchvision.transforms import InterpolationMode
        # Maps each PIL resampling filter to the torchvision interpolation mode of the same name.
        pil_torch_interpolation_mapping = {
            PILImageResampling.NEAREST: InterpolationMode.NEAREST,
            PILImageResampling.BOX: InterpolationMode.BOX,
            PILImageResampling.BILINEAR: InterpolationMode.BILINEAR,
            PILImageResampling.HAMMING: InterpolationMode.HAMMING,
            PILImageResampling.BICUBIC: InterpolationMode.BICUBIC,
            PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
        }
 if TYPE_CHECKING:
    if is_torch_available():
        import torch
@@ -90,14 +106,30 @@ def is_pil_image(img):
    return is_vision_available() and isinstance(img, PIL.Image.Image)
 class ImageType(ExplicitEnum):
    """Kinds of image container distinguished by `get_image_type`; each value is a short lowercase name."""
    PIL = "pillow"
    TORCH = "torch"
    NUMPY = "numpy"
    TENSORFLOW = "tensorflow"
    JAX = "jax"
 def get_image_type(image):
    """
    Return the `ImageType` member matching the container type of `image`.

    The checks run in a fixed order (PIL, torch, numpy, TensorFlow, JAX) and the first match wins.
    Raises:
        ValueError: If `image` matches none of the supported types.
    """
    type_checks = (
        (is_pil_image, ImageType.PIL),
        (is_torch_tensor, ImageType.TORCH),
        (is_numpy_array, ImageType.NUMPY),
        (is_tf_tensor, ImageType.TENSORFLOW),
        (is_jax_tensor, ImageType.JAX),
    )
    for matches, image_type in type_checks:
        if matches(image):
            return image_type
    raise ValueError(f"Unrecognised image type {type(image)}")
 def is_valid_image(img):
-    return (
+    return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
        (is_vision_available() and isinstance(img, PIL.Image.Image))
        or isinstance(img, np.ndarray)
        or is_torch_tensor(img)
        or is_tf_tensor(img)
        or is_jax_tensor(img)
    )
 def valid_images(imgs):
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -19,13 +19,21 @@ import json
 import os
 import warnings
 from collections import OrderedDict
-from typing import Dict, Optional, Union
+from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
 # Build the list of all image processors
 from ...configuration_utils import PretrainedConfig
 from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
-from ...image_processing_utils import ImageProcessingMixin
+from ...image_processing_utils import BaseImageProcessor, ImageProcessingMixin
-from ...utils import CONFIG_NAME, IMAGE_PROCESSOR_NAME, get_file_from_repo, logging
+from ...image_processing_utils_fast import BaseImageProcessorFast
 from ...utils import (
    CONFIG_NAME,
    IMAGE_PROCESSOR_NAME,
    get_file_from_repo,
    is_torchvision_available,
    is_vision_available,
    logging,
 )
 from .auto_factory import _LazyAutoMapping
 from .configuration_auto import (
    CONFIG_MAPPING_NAMES,
@@ -37,104 +45,125 @@ from .configuration_auto import (
 logger = logging.get_logger(__name__)
if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    IMAGE_PROCESSOR_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    # Maps a model type to a tuple of image processor class names: the slow class first,
    # optionally followed by the fast class when one exists for that model.
    IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
        [
            ("align", ("EfficientNetImageProcessor",)),
            ("beit", ("BeitImageProcessor",)),
            ("bit", ("BitImageProcessor",)),
            ("blip", ("BlipImageProcessor",)),
            ("blip-2", ("BlipImageProcessor",)),
            ("bridgetower", ("BridgeTowerImageProcessor",)),
            ("chinese_clip", ("ChineseCLIPImageProcessor",)),
            ("clip", ("CLIPImageProcessor",)),
            ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("conditional_detr", ("ConditionalDetrImageProcessor",)),
            ("convnext", ("ConvNextImageProcessor",)),
            ("convnextv2", ("ConvNextImageProcessor",)),
            ("cvt", ("ConvNextImageProcessor",)),
            ("data2vec-vision", ("BeitImageProcessor",)),
            ("deformable_detr", ("DeformableDetrImageProcessor",)),
            ("deit", ("DeiTImageProcessor",)),
            ("depth_anything", ("DPTImageProcessor",)),
            ("deta", ("DetaImageProcessor",)),
            ("detr", ("DetrImageProcessor",)),
            ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("dinov2", ("BitImageProcessor",)),
            ("donut-swin", ("DonutImageProcessor",)),
            ("dpt", ("DPTImageProcessor",)),
            ("efficientformer", ("EfficientFormerImageProcessor",)),
            ("efficientnet", ("EfficientNetImageProcessor",)),
            ("flava", ("FlavaImageProcessor",)),
            ("focalnet", ("BitImageProcessor",)),
            ("fuyu", ("FuyuImageProcessor",)),
            ("git", ("CLIPImageProcessor",)),
            ("glpn", ("GLPNImageProcessor",)),
            ("grounding-dino", ("GroundingDinoImageProcessor",)),
            ("groupvit", ("CLIPImageProcessor",)),
            ("idefics", ("IdeficsImageProcessor",)),
            ("idefics2", ("Idefics2ImageProcessor",)),
            ("imagegpt", ("ImageGPTImageProcessor",)),
            ("instructblip", ("BlipImageProcessor",)),
            ("kosmos-2", ("CLIPImageProcessor",)),
            ("layoutlmv2", ("LayoutLMv2ImageProcessor",)),
            ("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
            ("levit", ("LevitImageProcessor",)),
            ("llava", ("CLIPImageProcessor",)),
            ("llava_next", ("LlavaNextImageProcessor",)),
            ("mask2former", ("Mask2FormerImageProcessor",)),
            ("maskformer", ("MaskFormerImageProcessor",)),
            ("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("mobilenet_v1", ("MobileNetV1ImageProcessor",)),
            ("mobilenet_v2", ("MobileNetV2ImageProcessor",)),
            ("mobilevit", ("MobileViTImageProcessor",)),
            ("mobilevitv2", ("MobileViTImageProcessor",)),
            ("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("nougat", ("NougatImageProcessor",)),
            ("oneformer", ("OneFormerImageProcessor",)),
            ("owlv2", ("Owlv2ImageProcessor",)),
            ("owlvit", ("OwlViTImageProcessor",)),
            # Carried over from the previous string-valued mapping so this model type keeps resolving.
            ("paligemma", ("CLIPImageProcessor",)),
            ("perceiver", ("PerceiverImageProcessor",)),
            ("pix2struct", ("Pix2StructImageProcessor",)),
            ("poolformer", ("PoolFormerImageProcessor",)),
            ("pvt", ("PvtImageProcessor",)),
            ("pvt_v2", ("PvtImageProcessor",)),
            ("regnet", ("ConvNextImageProcessor",)),
            ("resnet", ("ConvNextImageProcessor",)),
            ("sam", ("SamImageProcessor",)),
            ("segformer", ("SegformerImageProcessor",)),
            ("seggpt", ("SegGptImageProcessor",)),
            ("siglip", ("SiglipImageProcessor",)),
            ("swiftformer", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("swin", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("swin2sr", ("Swin2SRImageProcessor",)),
            ("swinv2", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("table-transformer", ("DetrImageProcessor",)),
            ("timesformer", ("VideoMAEImageProcessor",)),
            ("tvlt", ("TvltImageProcessor",)),
            ("tvp", ("TvpImageProcessor",)),
            ("udop", ("LayoutLMv3ImageProcessor",)),
            ("upernet", ("SegformerImageProcessor",)),
            ("van", ("ConvNextImageProcessor",)),
            # Carried over from the previous string-valued mapping so this model type keeps resolving.
            ("video_llava", ("VideoLlavaImageProcessor",)),
            ("videomae", ("VideoMAEImageProcessor",)),
            ("vilt", ("ViltImageProcessor",)),
            ("vipllava", ("CLIPImageProcessor",)),
            ("vit", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("vit_hybrid", ("ViTHybridImageProcessor",)),
            ("vit_mae", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("vitmatte", ("VitMatteImageProcessor",)),
            ("xclip", ("CLIPImageProcessor",)),
            ("yolos", ("YolosImageProcessor",)),
        ]
    )
 # Rewrite every entry as a `(slow, fast)` pair, replacing a class name with None when its backend is unavailable.
 for model_type, image_processors in IMAGE_PROCESSOR_MAPPING_NAMES.items():
    slow_image_processor_class, *fast_image_processor_class = image_processors
    if not is_vision_available():
        slow_image_processor_class = None
    # Keep the fast image processor only when one is declared and torchvision is available
    if fast_image_processor_class and fast_image_processor_class[0] is not None and is_torchvision_available():
        fast_image_processor_class = fast_image_processor_class[0]
    else:
        fast_image_processor_class = None
    IMAGE_PROCESSOR_MAPPING_NAMES[model_type] = (slow_image_processor_class, fast_image_processor_class)
 IMAGE_PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, IMAGE_PROCESSOR_MAPPING_NAMES)
 def image_processor_class_from_name(class_name: str):
    if class_name == "BaseImageProcessorFast":
        return BaseImageProcessorFast
    for module_name, extractors in IMAGE_PROCESSOR_MAPPING_NAMES.items():
        if class_name in extractors:
            module_name = model_type_to_module_name(module_name)
@@ -145,11 +174,12 @@ def image_processor_class_from_name(class_name: str):
            except AttributeError:
                continue
-    for _, extractor in IMAGE_PROCESSOR_MAPPING._extra_content.items():
+    for _, extractors in IMAGE_PROCESSOR_MAPPING._extra_content.items():
-        if getattr(extractor, "__name__", None) == class_name:
+        for extractor in extractors:
-            return extractor
+            if getattr(extractor, "__name__", None) == class_name:
                return extractor
-    # We did not fine the class, but maybe it's because a dep is missing. In that case, the class will be in the main
+    # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
    # init and we return the proper dummy to get an appropriate error message.
    main_module = importlib.import_module("transformers")
    if hasattr(main_module, class_name):
@@ -258,6 +288,13 @@ def get_image_processor_config(
        return json.load(reader)
 def _warning_fast_image_processor_available(fast_class):
    """
    Log a warning that a fast image processor class exists for the model while the slow one is being used.
    Args:
        fast_class: The fast image processor class (or its reference) to mention in the message.
    """
    message = (
        f"Fast image processor class {fast_class} is available for this model. "
        "Using slow image processor class. To use the fast image processor class set `use_fast=True`."
    )
    logger.warning(message)
 class AutoImageProcessor:
    r"""
    This is a generic image processor class that will be instantiated as one of the image processor classes of the
@@ -274,7 +311,7 @@ class AutoImageProcessor:
    @classmethod
    @replace_list_option_in_docstrings(IMAGE_PROCESSOR_MAPPING_NAMES)
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r"""
        Instantiate one of the image processor classes of the library from a pretrained model vocabulary.
@@ -314,6 +351,10 @@ class AutoImageProcessor:
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            use_fast (`bool`, *optional*, defaults to `False`):
                Use a fast torchvision-based image processor if it is supported for a given model.
                If a fast image processor is not available for a given model, a normal numpy-based image processor
                is returned instead.
            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
                If `False`, then this function returns just the final image processor object. If `True`, then this
                functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
@@ -358,6 +399,7 @@ class AutoImageProcessor:
            kwargs["token"] = use_auth_token
        config = kwargs.pop("config", None)
        use_fast = kwargs.pop("use_fast", False)
        trust_remote_code = kwargs.pop("trust_remote_code", None)
        kwargs["_from_auto"] = True
@@ -387,6 +429,11 @@ class AutoImageProcessor:
                image_processor_auto_map = config.auto_map["AutoImageProcessor"]
        if image_processor_class is not None:
            # Update class name to reflect the use_fast option. If class is not found, None is returned.
            if use_fast and not image_processor_class.endswith("Fast"):
                image_processor_class += "Fast"
            elif not use_fast and image_processor_class.endswith("Fast"):
                image_processor_class = image_processor_class[:-4]
            image_processor_class = image_processor_class_from_name(image_processor_class)
        has_remote_code = image_processor_auto_map is not None
@@ -395,10 +442,19 @@ class AutoImageProcessor:
            trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
        )
        if image_processor_auto_map is not None and not isinstance(image_processor_auto_map, tuple):
            # In some configs, only the slow image processor class is stored
            image_processor_auto_map = (image_processor_auto_map, None)
        if has_remote_code and trust_remote_code:
-            image_processor_class = get_class_from_dynamic_module(
+            if not use_fast and image_processor_auto_map[1] is not None:
-                image_processor_auto_map, pretrained_model_name_or_path, **kwargs
+                _warning_fast_image_processor_available(image_processor_auto_map[1])
-            )
+
            if use_fast and image_processor_auto_map[1] is not None:
                class_ref = image_processor_auto_map[1]
            else:
                class_ref = image_processor_auto_map[0]
            image_processor_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
            _ = kwargs.pop("code_revision", None)
            if os.path.isdir(pretrained_model_name_or_path):
                image_processor_class.register_for_auto_class()
@@ -407,8 +463,22 @@ class AutoImageProcessor:
            return image_processor_class.from_dict(config_dict, **kwargs)
        # Last try: we use the IMAGE_PROCESSOR_MAPPING.
        elif type(config) in IMAGE_PROCESSOR_MAPPING:
-            image_processor_class = IMAGE_PROCESSOR_MAPPING[type(config)]
+            image_processor_tuple = IMAGE_PROCESSOR_MAPPING[type(config)]
-            return image_processor_class.from_dict(config_dict, **kwargs)
+
            image_processor_class_py, image_processor_class_fast = image_processor_tuple
            if not use_fast and image_processor_class_fast is not None:
                _warning_fast_image_processor_available(image_processor_class_fast)
            if image_processor_class_fast and (use_fast or image_processor_class_py is None):
                return image_processor_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                if image_processor_class_py is not None:
                    return image_processor_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This image processor cannot be instantiated. Please make sure you have `Pillow` installed."
                    )
        raise ValueError(
            f"Unrecognized image processor in {pretrained_model_name_or_path}. Should have a "
@@ -417,7 +487,13 @@ class AutoImageProcessor:
        )
    @staticmethod
-    def register(config_class, image_processor_class, exist_ok=False):
+    def register(
        config_class,
        image_processor_class=None,
        slow_image_processor_class=None,
        fast_image_processor_class=None,
        exist_ok=False,
    ):
        """
        Register a new image processor for this class.
@@ -426,4 +502,43 @@ class AutoImageProcessor:
                The configuration corresponding to the model to register.
            image_processor_class ([`ImageProcessingMixin`]): The image processor to register.
        """
-        IMAGE_PROCESSOR_MAPPING.register(config_class, image_processor_class, exist_ok=exist_ok)
+        if image_processor_class is not None:
            if slow_image_processor_class is not None:
                raise ValueError("Cannot specify both image_processor_class and slow_image_processor_class")
            warnings.warn(
                "The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead",
                FutureWarning,
            )
            slow_image_processor_class = image_processor_class
        if slow_image_processor_class is None and fast_image_processor_class is None:
            raise ValueError("You need to specify either slow_image_processor_class or fast_image_processor_class")
        if slow_image_processor_class is not None and issubclass(slow_image_processor_class, BaseImageProcessorFast):
            raise ValueError("You passed a fast image processor in as the `slow_image_processor_class`.")
        if fast_image_processor_class is not None and issubclass(fast_image_processor_class, BaseImageProcessor):
            raise ValueError("You passed a slow image processor in as the `fast_image_processor_class`.")
        if (
            slow_image_processor_class is not None
            and fast_image_processor_class is not None
            and issubclass(fast_image_processor_class, BaseImageProcessorFast)
            and fast_image_processor_class.slow_image_processor_class != slow_image_processor_class
        ):
            raise ValueError(
                "The fast processor class you are passing has a `slow_image_processor_class` attribute that is not "
                "consistent with the slow processor class you passed (fast tokenizer has "
                f"{fast_image_processor_class.slow_image_processor_class} and you passed {slow_image_processor_class}. Fix one of those "
                "so they match!"
            )
        # Avoid resetting a set slow/fast image processor if we are passing just the other ones.
        if config_class in IMAGE_PROCESSOR_MAPPING._extra_content:
            existing_slow, existing_fast = IMAGE_PROCESSOR_MAPPING[config_class]
            if slow_image_processor_class is None:
                slow_image_processor_class = existing_slow
            if fast_image_processor_class is None:
                fast_image_processor_class = existing_fast
        IMAGE_PROCESSOR_MAPPING.register(
            config_class, (slow_image_processor_class, fast_image_processor_class), exist_ok=exist_ok
        )
--- a/src/transformers/models/vit/init.py
+++ b/src/transformers/models/vit/init.py
@@ -19,6 +19,7 @@ from ...utils import (
    is_flax_available,
    is_tf_available,
    is_torch_available,
    is_torchvision_available,
    is_vision_available,
 )
@@ -34,6 +35,15 @@ else:
    _import_structure["feature_extraction_vit"] = ["ViTFeatureExtractor"]
    _import_structure["image_processing_vit"] = ["ViTImageProcessor"]
 # Only expose the fast image processor module when torchvision is available.
 try:
    if not is_torchvision_available():
        raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
    pass
 else:
    _import_structure["image_processing_vit_fast"] = ["ViTImageProcessorFast"]
 try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
@@ -83,6 +93,14 @@ if TYPE_CHECKING:
        from .feature_extraction_vit import ViTFeatureExtractor
        from .image_processing_vit import ViTImageProcessor
    # Import the fast image processor only when torchvision is available.
    try:
        if not is_torchvision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .image_processing_vit_fast import ViTImageProcessorFast
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
--- a/src/transformers/models/vit/image_processing_vit_fast.py
+++ b/src/transformers/models/vit/image_processing_vit_fast.py
@@ -0,0 +1,289 @@
 # coding=utf-8
 # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Fast Image processor class for ViT."""
 import functools
 from typing import Dict, List, Optional, Union
 from ...image_processing_base import BatchFeature
 from ...image_processing_utils import get_size_dict
 from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict
 from ...image_transforms import FusedRescaleNormalize, NumpyToTensor, Rescale
 from ...image_utils import (
    IMAGENET_STANDARD_MEAN,
    IMAGENET_STANDARD_STD,
    ChannelDimension,
    ImageInput,
    ImageType,
    PILImageResampling,
    get_image_type,
    make_list_of_images,
    pil_torch_interpolation_mapping,
 )
 from ...utils import TensorType, logging
 from ...utils.import_utils import is_torch_available, is_torchvision_available
 logger = logging.get_logger(__name__)
 if is_torch_available():
    import torch
 if is_torchvision_available():
    from torchvision.transforms import Compose, Normalize, PILToTensor, Resize
 class ViTImageProcessorFast(BaseImageProcessorFast):
    r"""
    Constructs a fast ViT image processor, which preprocesses images with a composed torchvision transform pipeline.
    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
            size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
            `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
            parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
    """
    model_input_names = ["pixel_values"]
    # Mirrors the keyword arguments of `_build_transforms`.
    # NOTE(review): presumably read by `BaseImageProcessorFast` to decide when transforms must be rebuilt - confirm.
    _transform_params = [
        "do_resize",
        "do_rescale",
        "do_normalize",
        "size",
        "resample",
        "rescale_factor",
        "image_mean",
        "image_std",
        "image_type",
    ]
    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # Fall back to the 224x224 default when no size is given, then pass it through `get_size_dict`
        size = size if size is not None else {"height": 224, "width": 224}
        size = get_size_dict(size)
        self.do_resize = do_resize
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.size = size
        self.resample = resample
        self.rescale_factor = rescale_factor
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
        # NOTE(review): not read anywhere in this class; presumably maintained by the base class - confirm.
        self._transform_settings = {}
    def _build_transforms(
        self,
        do_resize: bool,
        size: Dict[str, int],
        resample: PILImageResampling,
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Union[float, List[float]],
        image_std: Union[float, List[float]],
        image_type: ImageType,
    ) -> "Compose":
        """
        Given the input settings build the image transforms using `torchvision.transforms.Compose`.
        The pipeline is, in order: conversion to a torch tensor (PIL and numpy inputs only), optional resize, then
        rescale and/or normalize.
        Returns:
            `Compose`: The composed transform to apply to each image.
        """
        transforms = []
        # All PIL and numpy values need to be converted to a torch tensor
        # to keep cross compatibility with slow image processors
        if image_type == ImageType.PIL:
            transforms.append(PILToTensor())
        elif image_type == ImageType.NUMPY:
            transforms.append(NumpyToTensor())
        if do_resize:
            transforms.append(
                Resize((size["height"], size["width"]), interpolation=pil_torch_interpolation_mapping[resample])
            )
        # We can combine rescale and normalize into a single operation for speed
        if do_rescale and do_normalize:
            transforms.append(FusedRescaleNormalize(image_mean, image_std, rescale_factor=rescale_factor))
        elif do_rescale:
            transforms.append(Rescale(rescale_factor=rescale_factor))
        elif do_normalize:
            transforms.append(Normalize(image_mean, image_std))
        return Compose(transforms)
    # NOTE(review): `lru_cache` on a method includes `self` in the cache key and keeps the last caller alive
    # (ruff B019). It also requires every argument to be hashable, hence the conversions done in `preprocess`.
    @functools.lru_cache(maxsize=1)
    def _validate_input_arguments(
        self,
        return_tensors: Union[str, TensorType],
        do_resize: bool,
        size: Dict[str, int],
        resample: PILImageResampling,
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Union[float, List[float]],
        image_std: Union[float, List[float]],
        data_format: Union[str, ChannelDimension],
        image_type: ImageType,
    ):
        """
        Check that the requested preprocessing configuration is supported by this processor.
        Raises:
            ValueError: If `return_tensors` is not `"pt"`, if `data_format` is not channels-first, or if an enabled
                step (resize, rescale, normalize) is missing one of the values it needs.
        """
        if return_tensors != "pt":
            raise ValueError("Only returning PyTorch tensors is currently supported.")
        if data_format != ChannelDimension.FIRST:
            raise ValueError("Only channel first data format is currently supported.")
        if do_resize and None in (size, resample):
            raise ValueError("Size and resample must be specified if do_resize is True.")
        if do_rescale and rescale_factor is None:
            raise ValueError("Rescale factor must be specified if do_rescale is True.")
        if do_normalize and None in (image_mean, image_std):
            raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")
    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = "pt",
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
        Preprocess an image or batch of images.
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
                resizing.
            resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
                `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
                an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use if `do_normalize` is set to `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Only "pt" is supported
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. The following formats are currently supported:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
                Note: this argument is accepted but not currently read by this method.
            kwargs:
                Accepted but not used by this method.
        Returns:
            `BatchFeature`: A batch with a single `pixel_values` entry holding the transformed images stacked along
            a new leading batch dimension.
        Raises:
            ValueError: If the first image is not a PIL image, torch tensor or numpy array, or if the arguments are
                rejected by `_validate_input_arguments`.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        size = size if size is not None else self.size
        # Make hashable for cache
        size = SizeDict(**size)
        image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean
        image_std = tuple(image_std) if isinstance(image_std, list) else image_std
        images = make_list_of_images(images)
        # The type of the first image decides the conversion step used for the whole batch
        image_type = get_image_type(images[0])
        if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
            raise ValueError(f"Unsupported input image type {image_type}")
        self._validate_input_arguments(
            do_resize=do_resize,
            size=size,
            resample=resample,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            return_tensors=return_tensors,
            data_format=data_format,
            image_type=image_type,
        )
        transforms = self.get_transforms(
            do_resize=do_resize,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            size=size,
            resample=resample,
            rescale_factor=rescale_factor,
            image_mean=image_mean,
            image_std=image_std,
            image_type=image_type,
        )
        transformed_images = [transforms(image) for image in images]
        # Stack along a new leading batch dimension. `torch.vstack` would instead concatenate along the existing
        # first dimension, merging the batch into the channel axis of channels-first images.
        data = {"pixel_values": torch.stack(transformed_images, dim=0)}
        return BatchFeature(data, tensor_type=return_tensors)
--- a/src/transformers/utils/dummy_torchvision_objects.py
+++ b/src/transformers/utils/dummy_torchvision_objects.py
@@ -0,0 +1,16 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
 from ..utils import DummyObject, requires_backends
 class BaseImageProcessorFast(metaclass=DummyObject):
    _backends = ["torchvision"]
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torchvision"])
 class ViTImageProcessorFast(metaclass=DummyObject):
    _backends = ["torchvision"]
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torchvision"])
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -9,6 +9,13 @@ class ImageProcessingMixin(metaclass=DummyObject):
        requires_backends(self, ["vision"])
 class BaseImageProcessor(metaclass=DummyObject):
    _backends = ["vision"]
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["vision"])
 class ImageFeatureExtractionMixin(metaclass=DummyObject):
    _backends = ["vision"]
--- a/tests/models/auto/test_image_processing_auto.py
+++ b/tests/models/auto/test_image_processing_auto.py
@@ -27,8 +27,10 @@ from transformers import (
    AutoImageProcessor,
    CLIPConfig,
    CLIPImageProcessor,
    ViTImageProcessor,
    ViTImageProcessorFast,
 )
-from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER
+from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_torchvision, require_vision
 sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
@@ -133,6 +135,23 @@ class AutoImageProcessorTest(unittest.TestCase):
        ):
            _ = AutoImageProcessor.from_pretrained("hf-internal-testing/config-no-model")
    @require_vision
    @require_torchvision
    def test_use_fast_selection(self):
        """``use_fast`` decides which ViT image processor class ``AutoImageProcessor`` returns."""
        checkpoint = "hf-internal-testing/tiny-random-vit"
        # (extra ``from_pretrained`` kwargs, expected class), checked in this order:
        # default -> slow, use_fast=True -> fast, use_fast=False -> slow.
        expectations = (
            ({}, ViTImageProcessor),
            ({"use_fast": True}, ViTImageProcessorFast),
            ({"use_fast": False}, ViTImageProcessor),
        )
        for load_kwargs, expected_class in expectations:
            image_processor = AutoImageProcessor.from_pretrained(checkpoint, **load_kwargs)
            self.assertIsInstance(image_processor, expected_class)
    def test_from_pretrained_dynamic_image_processor(self):
        # If remote code is not set, we will time out when asking whether to load the model.
        with self.assertRaises(ValueError):
--- a/tests/models/beit/test_image_processing_beit.py
+++ b/tests/models/beit/test_image_processing_beit.py
@@ -121,6 +121,7 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = BeitImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = BeitImageProcessingTester(self)
    @property
--- a/tests/models/blip/test_image_processing_blip.py
+++ b/tests/models/blip/test_image_processing_blip.py
@@ -90,6 +90,7 @@ class BlipImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = BlipImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = BlipImageProcessingTester(self)
    @property
@@ -112,6 +113,7 @@ class BlipImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.Tes
    image_processing_class = BlipImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create a tester helper configured for 4-channel inputs."""
        super().setUp()
        self.image_processor_tester = BlipImageProcessingTester(self, num_channels=4)
        # Inputs are requested with 4 channels while 3 are expected after encoding —
        # presumably the processor converts to RGB; TODO confirm.
        self.expected_encoded_image_num_channels = 3
--- a/tests/models/bridgetower/test_image_processing_bridgetower.py
+++ b/tests/models/bridgetower/test_image_processing_bridgetower.py
@@ -136,6 +136,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
    image_processing_class = BridgeTowerImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = BridgeTowerImageProcessingTester(self)
    @property
--- a/tests/models/chinese_clip/test_image_processing_chinese_clip.py
+++ b/tests/models/chinese_clip/test_image_processing_chinese_clip.py
@@ -98,6 +98,7 @@ class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
    image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper (center crop enabled)."""
        super().setUp()
        self.image_processor_tester = ChineseCLIPImageProcessingTester(self, do_center_crop=True)
    @property
@@ -135,6 +136,7 @@ class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unitt
    image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create a tester helper configured for 4-channel inputs."""
        super().setUp()
        self.image_processor_tester = ChineseCLIPImageProcessingTester(self, num_channels=4, do_center_crop=True)
        # Inputs are requested with 4 channels while 3 are expected after encoding —
        # presumably the processor converts to RGB; TODO confirm.
        self.expected_encoded_image_num_channels = 3
--- a/tests/models/clip/test_image_processing_clip.py
+++ b/tests/models/clip/test_image_processing_clip.py
@@ -94,6 +94,7 @@ class CLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = CLIPImageProcessor if is_vision_available() else None
    def setUp(self):
        super().setUp()
        self.image_processor_tester = CLIPImageProcessingTester(self)
    @property
--- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py
+++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py
@@ -131,6 +131,7 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
    image_processing_class = ConditionalDetrImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = ConditionalDetrImageProcessingTester(self)
    @property
--- a/tests/models/convnext/test_image_processing_convnext.py
+++ b/tests/models/convnext/test_image_processing_convnext.py
@@ -87,6 +87,7 @@ class ConvNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = ConvNextImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = ConvNextImageProcessingTester(self)
    @property
--- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py
+++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
@@ -131,6 +131,7 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
    image_processing_class = DeformableDetrImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = DeformableDetrImageProcessingTester(self)
    @property
--- a/tests/models/deit/test_image_processing_deit.py
+++ b/tests/models/deit/test_image_processing_deit.py
@@ -93,6 +93,7 @@ class DeiTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    test_cast_dtype = True
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = DeiTImageProcessingTester(self)
    @property
--- a/tests/models/detr/test_image_processing_detr.py
+++ b/tests/models/detr/test_image_processing_detr.py
@@ -130,6 +130,7 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
    image_processing_class = DetrImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = DetrImageProcessingTester(self)
    @property
--- a/tests/models/donut/test_image_processing_donut.py
+++ b/tests/models/donut/test_image_processing_donut.py
@@ -99,6 +99,7 @@ class DonutImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = DonutImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = DonutImageProcessingTester(self)
    @property
--- a/tests/models/dpt/test_image_processing_dpt.py
+++ b/tests/models/dpt/test_image_processing_dpt.py
@@ -86,6 +86,7 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = DPTImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = DPTImageProcessingTester(self)
    @property
--- a/tests/models/efficientnet/test_image_processing_efficientnet.py
+++ b/tests/models/efficientnet/test_image_processing_efficientnet.py
@@ -86,6 +86,7 @@ class EfficientNetImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase
    image_processing_class = EfficientNetImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = EfficientNetImageProcessorTester(self)
    @property
--- a/tests/models/flava/test_image_processing_flava.py
+++ b/tests/models/flava/test_image_processing_flava.py
@@ -175,6 +175,7 @@ class FlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    maxDiff = None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = FlavaImageProcessingTester(self)
    @property
--- a/tests/models/glpn/test_image_processing_glpn.py
+++ b/tests/models/glpn/test_image_processing_glpn.py
@@ -93,6 +93,7 @@ class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = GLPNImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = GLPNImageProcessingTester(self)
    @property
--- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py
+++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py
@@ -146,6 +146,7 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
    image_processing_class = GroundingDinoImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = GroundingDinoImageProcessingTester(self)
    @property
--- a/tests/models/idefics/test_image_processing_idefics.py
+++ b/tests/models/idefics/test_image_processing_idefics.py
@@ -127,6 +127,7 @@ class IdeficsImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = IdeficsImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = IdeficsImageProcessingTester(self)
    @property
--- a/tests/models/idefics2/test_image_processing_idefics2.py
+++ b/tests/models/idefics2/test_image_processing_idefics2.py
@@ -185,6 +185,7 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = Idefics2ImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = Idefics2ImageProcessingTester(self)
    @property
--- a/tests/models/imagegpt/test_image_processing_imagegpt.py
+++ b/tests/models/imagegpt/test_image_processing_imagegpt.py
@@ -22,7 +22,8 @@ import unittest
 import numpy as np
 from datasets import load_dataset
-from transformers.testing_utils import require_torch, require_vision, slow
+from transformers import AutoImageProcessor
 from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
@@ -96,6 +97,7 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = ImageGPTImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = ImageGPTImageProcessingTester(self)
    @property
@@ -141,18 +143,38 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
                self.assertEqual(image_processor_first[key], value)
    def test_image_processor_from_and_save_pretrained(self):
-        image_processor_first = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
            # Use the loop variable so every class in the list is saved and reloaded,
            # instead of re-testing `self.image_processing_class` on each iteration.
            image_processor_first = image_processing_class(**self.image_processor_dict)
-        with tempfile.TemporaryDirectory() as tmpdirname:
+            with tempfile.TemporaryDirectory() as tmpdirname:
-            image_processor_first.save_pretrained(tmpdirname)
+                image_processor_first.save_pretrained(tmpdirname)
-            image_processor_second = self.image_processing_class.from_pretrained(tmpdirname).to_dict()
+                image_processor_second = image_processing_class.from_pretrained(tmpdirname).to_dict()
-        image_processor_first = image_processor_first.to_dict()
+            image_processor_first = image_processor_first.to_dict()
-        for key, value in image_processor_first.items():
+            for key, value in image_processor_first.items():
-            if key == "clusters":
+                if key == "clusters":
-                self.assertTrue(np.array_equal(value, image_processor_second[key]))
+                    self.assertTrue(np.array_equal(value, image_processor_second[key]))
-            else:
+                else:
-                self.assertEqual(image_processor_first[key], value)
                    # Compare with the reloaded processor; `image_processor_first[key]` is `value` itself.
+                    self.assertEqual(value, image_processor_second[key])
    def test_image_processor_save_load_with_autoimageprocessor(self):
        """Check that a processor saved to disk and reloaded via ``AutoImageProcessor`` keeps its configuration.

        For each class in ``self.image_processor_list`` the processor is saved to a temporary
        directory, the written JSON file is format-checked, and the ``to_dict()`` of the reloaded
        processor is compared key by key with the ``to_dict()`` of the one that was saved.
        """
        for image_processing_class in self.image_processor_list:
            image_processor_first = image_processing_class(**self.image_processor_dict)
            with tempfile.TemporaryDirectory() as tmpdirname:
                saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
                check_json_file_has_correct_format(saved_file)
                image_processor_second = AutoImageProcessor.from_pretrained(tmpdirname)
            image_processor_first = image_processor_first.to_dict()
            image_processor_second = image_processor_second.to_dict()
            for key, value in image_processor_first.items():
                if key == "clusters":
                    # "clusters" is compared with `np.array_equal` rather than `==`.
                    self.assertTrue(np.array_equal(value, image_processor_second[key]))
                else:
                    # Compare with the reloaded processor's value; checking
                    # `image_processor_first[key]` against `value` can never fail.
                    self.assertEqual(value, image_processor_second[key])
    @unittest.skip("ImageGPT requires clusters at initialization")
    def test_init_without_params(self):
--- a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py
@@ -76,6 +76,7 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
    image_processing_class = LayoutLMv2ImageProcessor if is_pytesseract_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = LayoutLMv2ImageProcessingTester(self)
    @property
--- a/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py
@@ -76,6 +76,7 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
    image_processing_class = LayoutLMv3ImageProcessor if is_pytesseract_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = LayoutLMv3ImageProcessingTester(self)
    @property
--- a/tests/models/levit/test_image_processing_levit.py
+++ b/tests/models/levit/test_image_processing_levit.py
@@ -91,6 +91,7 @@ class LevitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = LevitImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = LevitImageProcessingTester(self)
    @property
--- a/tests/models/llava_next/test_image_processor_llava_next.py
+++ b/tests/models/llava_next/test_image_processor_llava_next.py
@@ -105,6 +105,7 @@ class LlavaNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaNext
    def setUp(self):
        super().setUp()
        self.image_processor_tester = LlavaNextImageProcessingTester(self)
    @property
--- a/tests/models/mask2former/test_image_processing_mask2former.py
+++ b/tests/models/mask2former/test_image_processing_mask2former.py
@@ -149,6 +149,7 @@ class Mask2FormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
    image_processing_class = Mask2FormerImageProcessor if (is_vision_available() and is_torch_available()) else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = Mask2FormerImageProcessingTester(self)
    @property
--- a/tests/models/maskformer/test_image_processing_maskformer.py
+++ b/tests/models/maskformer/test_image_processing_maskformer.py
@@ -149,6 +149,7 @@ class MaskFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
    image_processing_class = MaskFormerImageProcessor if (is_vision_available() and is_torch_available()) else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = MaskFormerImageProcessingTester(self)
    @property
--- a/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py
+++ b/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py
@@ -82,6 +82,7 @@ class MobileNetV1ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
    image_processing_class = MobileNetV1ImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = MobileNetV1ImageProcessingTester(self)
    @property
--- a/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py
+++ b/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py
@@ -82,6 +82,7 @@ class MobileNetV2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
    image_processing_class = MobileNetV2ImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = MobileNetV2ImageProcessingTester(self)
    @property
--- a/tests/models/mobilevit/test_image_processing_mobilevit.py
+++ b/tests/models/mobilevit/test_image_processing_mobilevit.py
@@ -112,6 +112,7 @@ class MobileViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = MobileViTImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = MobileViTImageProcessingTester(self)
    @property
--- a/tests/models/nougat/test_image_processing_nougat.py
+++ b/tests/models/nougat/test_image_processing_nougat.py
@@ -111,6 +111,7 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = NougatImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = NougatImageProcessingTester(self)
    @property
--- a/tests/models/oneformer/test_image_processing_oneformer.py
+++ b/tests/models/oneformer/test_image_processing_oneformer.py
@@ -159,6 +159,7 @@ class OneFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = image_processing_class
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = OneFormerImageProcessorTester(self)
    @property
--- a/tests/models/owlv2/test_image_processor_owlv2.py
+++ b/tests/models/owlv2/test_image_processor_owlv2.py
@@ -90,6 +90,7 @@ class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = Owlv2ImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = Owlv2ImageProcessingTester(self)
    @property
--- a/tests/models/owlvit/test_image_processing_owlvit.py
+++ b/tests/models/owlvit/test_image_processing_owlvit.py
@@ -92,6 +92,7 @@ class OwlViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = OwlViTImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = OwlViTImageProcessingTester(self)
    @property
--- a/tests/models/pix2struct/test_image_processing_pix2struct.py
+++ b/tests/models/pix2struct/test_image_processing_pix2struct.py
@@ -87,6 +87,7 @@ class Pix2StructImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
    image_processing_class = Pix2StructImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = Pix2StructImageProcessingTester(self)
    @property
@@ -288,6 +289,7 @@ class Pix2StructImageProcessingTestFourChannels(ImageProcessingTestMixin, unitte
    image_processing_class = Pix2StructImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create a tester helper configured for 4-channel inputs."""
        super().setUp()
        self.image_processor_tester = Pix2StructImageProcessingTester(self, num_channels=4)
        # Inputs are requested with 4 channels while 3 are expected after encoding —
        # presumably the processor converts to RGB; TODO confirm.
        self.expected_encoded_image_num_channels = 3
--- a/tests/models/poolformer/test_image_processing_poolformer.py
+++ b/tests/models/poolformer/test_image_processing_poolformer.py
@@ -88,6 +88,7 @@ class PoolFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
    image_processing_class = PoolFormerImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = PoolFormerImageProcessingTester(self)
    @property
--- a/tests/models/pvt/test_image_processing_pvt.py
+++ b/tests/models/pvt/test_image_processing_pvt.py
@@ -84,6 +84,7 @@ class PvtImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = PvtImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = PvtImageProcessingTester(self)
    @property
--- a/tests/models/segformer/test_image_processing_segformer.py
+++ b/tests/models/segformer/test_image_processing_segformer.py
@@ -112,6 +112,7 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = SegformerImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = SegformerImageProcessingTester(self)
    @property
--- a/tests/models/seggpt/test_image_processing_seggpt.py
+++ b/tests/models/seggpt/test_image_processing_seggpt.py
@@ -114,6 +114,7 @@ class SegGptImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = SegGptImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = SegGptImageProcessingTester(self)
    @property
--- a/tests/models/siglip/test_image_processor_siglip.py
+++ b/tests/models/siglip/test_image_processor_siglip.py
@@ -91,6 +91,7 @@ class SiglipImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = SiglipImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = SiglipImageProcessingTester(self)
    @property
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -77,6 +77,7 @@ class SuperPointImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
    image_processing_class = SuperPointImageProcessor if is_vision_available() else None
    def setUp(self) -> None:
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = SuperPointImageProcessingTester(self)
    @property
--- a/tests/models/swin2sr/test_image_processing_swin2sr.py
+++ b/tests/models/swin2sr/test_image_processing_swin2sr.py
@@ -98,6 +98,7 @@ class Swin2SRImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = Swin2SRImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = Swin2SRImageProcessingTester(self)
    @property
--- a/tests/models/tvp/test_image_processing_tvp.py
+++ b/tests/models/tvp/test_image_processing_tvp.py
@@ -127,6 +127,7 @@ class TvpImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = TvpImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = TvpImageProcessingTester(self)
    @property
--- a/tests/models/video_llava/test_image_processing_video_llava.py
+++ b/tests/models/video_llava/test_image_processing_video_llava.py
@@ -128,6 +128,7 @@ class VideoLlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->VideoLlava
    def setUp(self):
        super().setUp()
        self.image_processor_tester = VideoLlavaImageProcessingTester(self)
    @property
--- a/tests/models/videomae/test_image_processing_videomae.py
+++ b/tests/models/videomae/test_image_processing_videomae.py
@@ -99,6 +99,7 @@ class VideoMAEImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = VideoMAEImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = VideoMAEImageProcessingTester(self)
    @property
--- a/tests/models/vilt/test_image_processing_vilt.py
+++ b/tests/models/vilt/test_image_processing_vilt.py
@@ -130,6 +130,7 @@ class ViltImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = ViltImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = ViltImageProcessingTester(self)
    @property
--- a/tests/models/vit/test_image_processing_vit.py
+++ b/tests/models/vit/test_image_processing_vit.py
@@ -84,6 +84,7 @@ class ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = ViTImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = ViTImageProcessingTester(self)
    @property
@@ -91,16 +92,18 @@ class ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        return self.image_processor_tester.prepare_image_processor_dict()
    def test_image_processor_properties(self):
        # Every class in `self.image_processor_list` must expose the attributes checked below.
-        image_processing = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
-        self.assertTrue(hasattr(image_processing, "image_mean"))
+            image_processing = image_processing_class(**self.image_processor_dict)
-        self.assertTrue(hasattr(image_processing, "image_std"))
+            self.assertTrue(hasattr(image_processing, "image_mean"))
-        self.assertTrue(hasattr(image_processing, "do_normalize"))
+            self.assertTrue(hasattr(image_processing, "image_std"))
-        self.assertTrue(hasattr(image_processing, "do_resize"))
+            self.assertTrue(hasattr(image_processing, "do_normalize"))
-        self.assertTrue(hasattr(image_processing, "size"))
+            self.assertTrue(hasattr(image_processing, "do_resize"))
            self.assertTrue(hasattr(image_processing, "size"))
    def test_image_processor_from_dict_with_kwargs(self):
        # `from_dict` keeps the configured size; a `size=42` keyword overrides it with a 42x42 size dict.
-        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
-        self.assertEqual(image_processor.size, {"height": 18, "width": 18})
+            image_processor = image_processing_class.from_dict(self.image_processor_dict)
            self.assertEqual(image_processor.size, {"height": 18, "width": 18})
-        image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
+            image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
-        self.assertEqual(image_processor.size, {"height": 42, "width": 42})
+            self.assertEqual(image_processor.size, {"height": 42, "width": 42})
--- a/tests/models/vitmatte/test_image_processing_vitmatte.py
+++ b/tests/models/vitmatte/test_image_processing_vitmatte.py
@@ -94,6 +94,7 @@ class VitMatteImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = VitMatteImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = VitMatteImageProcessingTester(self)
    @property
--- a/tests/models/vivit/test_image_processing_vivit.py
+++ b/tests/models/vivit/test_image_processing_vivit.py
@@ -99,6 +99,7 @@ class VivitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = VivitImageProcessor if is_vision_available() else None
    def setUp(self):
        super().setUp()
        self.image_processor_tester = VivitImageProcessingTester(self)
    @property
--- a/tests/models/yolos/test_image_processing_yolos.py
+++ b/tests/models/yolos/test_image_processing_yolos.py
@@ -143,6 +143,7 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
    image_processing_class = YolosImageProcessor if is_vision_available() else None
    def setUp(self):
        """Run the inherited ``setUp`` first, then create the tester helper bound to this test case."""
        super().setUp()
        self.image_processor_tester = YolosImageProcessingTester(self)
    @property
--- a/tests/test_image_processing_common.py
+++ b/tests/test_image_processing_common.py
@@ -19,7 +19,9 @@ import os
 import pathlib
 import tempfile
-from transformers import BatchFeature
+import requests
 from transformers import AutoImageProcessor, BatchFeature
 from transformers.image_utils import AnnotationFormat, AnnotionFormat
 from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
@@ -129,176 +131,263 @@ def prepare_video_inputs(
 class ImageProcessingTestMixin:
    test_cast_dtype = None
    image_processing_class = None
    fast_image_processing_class = None
    image_processors_list = None
    test_slow_image_processor = True
    test_fast_image_processor = True
    def setUp(self):
        """
        Build `self.image_processor_list`, the list of image processor classes to test.

        The slow class is listed first and the fast class second. A class is included only when its
        `test_*_image_processor` flag is truthy and the class attribute itself is truthy (e.g. not `None`).
        """
        flag_and_class_pairs = (
            (self.test_slow_image_processor, self.image_processing_class),
            (self.test_fast_image_processor, self.fast_image_processing_class),
        )
        self.image_processor_list = [
            processor_cls for is_enabled, processor_cls in flag_and_class_pairs if is_enabled and processor_cls
        ]
    @require_vision
    @require_torch
    def test_slow_fast_equivalence(self):
        """
        Check that the slow and fast image processors return matching `pixel_values` for the same image.

        Both processors are built from `self.image_processor_dict` and applied to one downloaded image; the
        outputs are compared with `torch.allclose` using `atol=1e-3`.

        The test is skipped when either processor is disabled through `test_slow_image_processor` /
        `test_fast_image_processor`, or when one of the two processor classes is `None`.
        """
        # Evaluate the skip conditions first so that a skipped test never touches the network.
        if not self.test_slow_image_processor or not self.test_fast_image_processor:
            self.skipTest("Skipping slow/fast equivalence test")

        if self.image_processing_class is None or self.fast_image_processing_class is None:
            self.skipTest("Skipping slow/fast equivalence test as one of the image processors is not defined")

        # The timeout keeps an unresponsive host from hanging the whole test run.
        response = requests.get(
            "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True, timeout=30
        )
        dummy_image = Image.open(response.raw)

        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)

        encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
        encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")

        self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-3))
    @require_vision
    @require_torch
    def test_fast_is_faster_than_slow(self):
        """
        Check that one call to the fast image processor takes no longer than one call to the slow one.

        Both processors are built from `self.image_processor_dict` and timed once each on the same downloaded
        image, slow processor first.

        The test is skipped when either processor is disabled through `test_slow_image_processor` /
        `test_fast_image_processor`, or when one of the two processor classes is `None`.
        """
        import time

        def measure_time(image_processor, image):
            # Duration in seconds of a single preprocessing call. This is a plain local helper: it must be
            # called directly, not through `self`, because it is not an attribute of the test case.
            # `perf_counter` is monotonic and high resolution, unlike `time.time`.
            start = time.perf_counter()
            _ = image_processor(image, return_tensors="pt")
            return time.perf_counter() - start

        # Evaluate the skip conditions first so that a skipped test never touches the network.
        if not self.test_slow_image_processor or not self.test_fast_image_processor:
            self.skipTest("Skipping speed test")

        if self.image_processing_class is None or self.fast_image_processing_class is None:
            self.skipTest("Skipping speed test as one of the image processors is not defined")

        # The timeout keeps an unresponsive host from hanging the whole test run.
        response = requests.get(
            "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True, timeout=30
        )
        dummy_image = Image.open(response.raw)

        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)

        slow_time = measure_time(image_processor_slow, dummy_image)
        fast_time = measure_time(image_processor_fast, dummy_image)

        self.assertLessEqual(fast_time, slow_time)
    def test_image_processor_to_json_string(self):
-        image_processor = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
-        obj = json.loads(image_processor.to_json_string())
+            image_processor = image_processing_class(**self.image_processor_dict)
-        for key, value in self.image_processor_dict.items():
+            obj = json.loads(image_processor.to_json_string())
-            self.assertEqual(obj[key], value)
+            for key, value in self.image_processor_dict.items():
                self.assertEqual(obj[key], value)
    def test_image_processor_to_json_file(self):
-        image_processor_first = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
            image_processor_first = image_processing_class(**self.image_processor_dict)
-        with tempfile.TemporaryDirectory() as tmpdirname:
+            with tempfile.TemporaryDirectory() as tmpdirname:
-            json_file_path = os.path.join(tmpdirname, "image_processor.json")
+                json_file_path = os.path.join(tmpdirname, "image_processor.json")
-            image_processor_first.to_json_file(json_file_path)
+                image_processor_first.to_json_file(json_file_path)
-            image_processor_second = self.image_processing_class.from_json_file(json_file_path)
+                image_processor_second = image_processing_class.from_json_file(json_file_path)
-        self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
+            self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
    def test_image_processor_from_and_save_pretrained(self):
-        image_processor_first = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
            image_processor_first = image_processing_class(**self.image_processor_dict)
-        with tempfile.TemporaryDirectory() as tmpdirname:
+            with tempfile.TemporaryDirectory() as tmpdirname:
-            saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
+                saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
-            check_json_file_has_correct_format(saved_file)
+                check_json_file_has_correct_format(saved_file)
-            image_processor_second = self.image_processing_class.from_pretrained(tmpdirname)
+                image_processor_second = image_processing_class.from_pretrained(tmpdirname)
-        self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
+            self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
    def test_image_processor_save_load_with_autoimageprocessor(self):
        """
        Round-trip each image processor class through `save_pretrained` and `AutoImageProcessor.from_pretrained`.

        For every class in `self.image_processor_list`, an instance is saved to a temporary directory, the first
        saved file is checked with `check_json_file_has_correct_format`, and the reloaded processor must have
        the same `to_dict()` as the one that was saved.
        """
        for processor_cls in self.image_processor_list:
            original_processor = processor_cls(**self.image_processor_dict)

            with tempfile.TemporaryDirectory() as save_directory:
                first_saved_path = original_processor.save_pretrained(save_directory)[0]
                check_json_file_has_correct_format(first_saved_path)
                reloaded_processor = AutoImageProcessor.from_pretrained(save_directory)

            # Compared after the temporary directory is gone: only the in-memory configs are needed here.
            self.assertEqual(reloaded_processor.to_dict(), original_processor.to_dict())
    def test_init_without_params(self):
-        image_processor = self.image_processing_class()
+        for image_processing_class in self.image_processor_list:
-        self.assertIsNotNone(image_processor)
+            image_processor = image_processing_class()
            self.assertIsNotNone(image_processor)
    @require_torch
    @require_vision
    def test_cast_dtype_device(self):
-        if self.test_cast_dtype is not None:
+        for image_processing_class in self.image_processor_list:
-            # Initialize image_processor
+            if self.test_cast_dtype is not None:
-            image_processor = self.image_processing_class(**self.image_processor_dict)
+                # Initialize image_processor
                image_processor = image_processing_class(**self.image_processor_dict)
                # create random PyTorch tensors
                image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
                encoding = image_processor(image_inputs, return_tensors="pt")
                # for layoutLM compatiblity
                self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
                self.assertEqual(encoding.pixel_values.dtype, torch.float32)
                encoding = image_processor(image_inputs, return_tensors="pt").to(torch.float16)
                self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
                self.assertEqual(encoding.pixel_values.dtype, torch.float16)
                encoding = image_processor(image_inputs, return_tensors="pt").to("cpu", torch.bfloat16)
                self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
                self.assertEqual(encoding.pixel_values.dtype, torch.bfloat16)
                with self.assertRaises(TypeError):
                    _ = image_processor(image_inputs, return_tensors="pt").to(torch.bfloat16, "cpu")
                # Try with text + image feature
                encoding = image_processor(image_inputs, return_tensors="pt")
                encoding.update({"input_ids": torch.LongTensor([[1, 2, 3], [4, 5, 6]])})
                encoding = encoding.to(torch.float16)
                self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
                self.assertEqual(encoding.pixel_values.dtype, torch.float16)
                self.assertEqual(encoding.input_ids.dtype, torch.long)
    def test_call_pil(self):
        """
        Call every image processor class on PIL inputs and check the shape of the returned `pixel_values`.

        Inputs come from `self.image_processor_tester.prepare_image_inputs(equal_resolution=False)`. The expected
        per-image shape comes from the tester's `expected_output_image_shape`; the leading dimension must be 1
        for a single image and `batch_size` for the full list.
        """
        for processor_cls in self.image_processor_list:
            tester = self.image_processor_tester
            processor = processor_cls(**self.image_processor_dict)

            # The tester is expected to hand back PIL images here
            pil_images = tester.prepare_image_inputs(equal_resolution=False)
            for pil_image in pil_images:
                self.assertIsInstance(pil_image, Image.Image)

            # Single (unbatched) image
            first_image = pil_images[0]
            single_pixel_values = processor(first_image, return_tensors="pt").pixel_values
            single_shape = tester.expected_output_image_shape([first_image])
            self.assertEqual(tuple(single_pixel_values.shape), (1,) + tuple(single_shape))

            # Full batch
            batch_pixel_values = processor(pil_images, return_tensors="pt").pixel_values
            batch_shape = tester.expected_output_image_shape(pil_images)
            self.assertEqual(tuple(batch_pixel_values.shape), (tester.batch_size,) + tuple(batch_shape))
    def test_call_numpy(self):
        """
        Call every image processor class on NumPy inputs and check the shape of the returned `pixel_values`.

        Inputs come from `self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)`.
        The expected per-image shape comes from the tester's `expected_output_image_shape`; the leading
        dimension must be 1 for a single image and `batch_size` for the full list.
        """
        for processor_cls in self.image_processor_list:
            tester = self.image_processor_tester
            processor = processor_cls(**self.image_processor_dict)

            # The tester is expected to hand back numpy arrays here
            numpy_images = tester.prepare_image_inputs(equal_resolution=False, numpify=True)
            for numpy_image in numpy_images:
                self.assertIsInstance(numpy_image, np.ndarray)

            # Single (unbatched) image
            first_image = numpy_images[0]
            single_pixel_values = processor(first_image, return_tensors="pt").pixel_values
            single_shape = tester.expected_output_image_shape([first_image])
            self.assertEqual(tuple(single_pixel_values.shape), (1,) + tuple(single_shape))

            # Full batch
            batch_pixel_values = processor(numpy_images, return_tensors="pt").pixel_values
            batch_shape = tester.expected_output_image_shape(numpy_images)
            self.assertEqual(tuple(batch_pixel_values.shape), (tester.batch_size,) + tuple(batch_shape))
    def test_call_pytorch(self):
        for image_processing_class in self.image_processor_list:
            # Initialize image_processing
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random PyTorch tensors
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
-            encoding = image_processor(image_inputs, return_tensors="pt")
+            for image in image_inputs:
-            # for layoutLM compatiblity
+                self.assertIsInstance(image, torch.Tensor)
            self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
            self.assertEqual(encoding.pixel_values.dtype, torch.float32)
-            encoding = image_processor(image_inputs, return_tensors="pt").to(torch.float16)
+            # Test not batched input
-            self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
-            self.assertEqual(encoding.pixel_values.dtype, torch.float16)
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
-            encoding = image_processor(image_inputs, return_tensors="pt").to("cpu", torch.bfloat16)
+            # Test batched
-            self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
-            self.assertEqual(encoding.pixel_values.dtype, torch.bfloat16)
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
-
+            self.assertEqual(
-            with self.assertRaises(TypeError):
+                tuple(encoded_images.shape),
-                _ = image_processor(image_inputs, return_tensors="pt").to(torch.bfloat16, "cpu")
+                (self.image_processor_tester.batch_size, *expected_output_image_shape),
-
+            )
            # Try with text + image feature
            encoding = image_processor(image_inputs, return_tensors="pt")
            encoding.update({"input_ids": torch.LongTensor([[1, 2, 3], [4, 5, 6]])})
            encoding = encoding.to(torch.float16)
            self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
            self.assertEqual(encoding.pixel_values.dtype, torch.float16)
            self.assertEqual(encoding.input_ids.dtype, torch.long)
    def test_call_pil(self):
        """
        Call every image processor class on PIL inputs and check the shape of the returned `pixel_values`.

        Iterates over `self.image_processor_list` rather than using `self.image_processing_class` alone, so
        every class in that list (slow and fast) is exercised. The expected per-image shape comes from the
        tester's `expected_output_image_shape`; the leading dimension must be 1 for a single image and
        `batch_size` for the full list.
        """
        for image_processing_class in self.image_processor_list:
            # Initialize image_processing
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random PIL images
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
            for image in image_inputs:
                self.assertIsInstance(image, Image.Image)

            # Test not batched input
            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

            # Test batched
            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
            self.assertEqual(
                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
            )
    def test_call_numpy(self):
        """
        Call every image processor class on NumPy inputs and check the shape of the returned `pixel_values`.

        Iterates over `self.image_processor_list` rather than using `self.image_processing_class` alone, so
        every class in that list (slow and fast) is exercised. The expected per-image shape comes from the
        tester's `expected_output_image_shape`; the leading dimension must be 1 for a single image and
        `batch_size` for the full list.
        """
        for image_processing_class in self.image_processor_list:
            # Initialize image_processing
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random numpy tensors
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
            for image in image_inputs:
                self.assertIsInstance(image, np.ndarray)

            # Test not batched input
            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

            # Test batched
            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
            self.assertEqual(
                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
            )
    def test_call_pytorch(self):
        """
        Call every image processor class on PyTorch inputs and check the shape of the returned `pixel_values`.

        Iterates over `self.image_processor_list` rather than using `self.image_processing_class` alone, so
        every class in that list (slow and fast) is exercised. The expected per-image shape comes from the
        tester's `expected_output_image_shape`; the leading dimension must be 1 for a single image and
        `batch_size` for the full list.
        """
        for image_processing_class in self.image_processor_list:
            # Initialize image_processing
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random PyTorch tensors
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)

            for image in image_inputs:
                self.assertIsInstance(image, torch.Tensor)

            # Test not batched input
            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

            # Test batched
            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
            self.assertEqual(
                tuple(encoded_images.shape),
                (self.image_processor_tester.batch_size, *expected_output_image_shape),
            )
    def test_call_numpy_4_channels(self):
-        # Test that can process images which have an arbitrary number of channels
+        for image_processing_class in self.image_processor_list:
-        # Initialize image_processing
+            # Test that can process images which have an arbitrary number of channels
-        image_processor = self.image_processing_class(**self.image_processor_dict)
+            # Initialize image_processing
            image_processor = image_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
+            # create random numpy tensors
-        self.image_processor_tester.num_channels = 4
+            self.image_processor_tester.num_channels = 4
-        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
-        # Test not batched input
+            # Test not batched input
-        encoded_images = image_processor(
+            encoded_images = image_processor(
-            image_inputs[0],
+                image_inputs[0],
-            return_tensors="pt",
+                return_tensors="pt",
-            input_data_format="channels_first",
+                input_data_format="channels_first",
-            image_mean=0,
+                image_mean=0,
-            image_std=1,
+                image_std=1,
-        ).pixel_values
+            ).pixel_values
-        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
-        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
-        # Test batched
+            # Test batched
-        encoded_images = image_processor(
+            encoded_images = image_processor(
-            image_inputs,
+                image_inputs,
-            return_tensors="pt",
+                return_tensors="pt",
-            input_data_format="channels_first",
+                input_data_format="channels_first",
-            image_mean=0,
+                image_mean=0,
-            image_std=1,
+                image_std=1,
-        ).pixel_values
+            ).pixel_values
-        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
-        self.assertEqual(
+            self.assertEqual(
-            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
+                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
-        )
+            )
    def test_image_processor_preprocess_arguments(self):
-        image_processor = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
-        if hasattr(image_processor, "_valid_processor_keys") and hasattr(image_processor, "preprocess"):
+            image_processor = image_processing_class(**self.image_processor_dict)
-            preprocess_parameter_names = inspect.getfullargspec(image_processor.preprocess).args
+            if hasattr(image_processor, "_valid_processor_keys") and hasattr(image_processor, "preprocess"):
-            preprocess_parameter_names.remove("self")
+                preprocess_parameter_names = inspect.getfullargspec(image_processor.preprocess).args
-            preprocess_parameter_names.sort()
+                preprocess_parameter_names.remove("self")
-            valid_processor_keys = image_processor._valid_processor_keys
+                preprocess_parameter_names.sort()
-            valid_processor_keys.sort()
+                valid_processor_keys = image_processor._valid_processor_keys
-            self.assertEqual(preprocess_parameter_names, valid_processor_keys)
+                valid_processor_keys.sort()
                self.assertEqual(preprocess_parameter_names, valid_processor_keys)
 class AnnotationFormatTestMixin: