From f53fe35b29c06307b44781db8c65c7f94f5eeadd Mon Sep 17 00:00:00 2001
From: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Date: Tue, 11 Jun 2024 15:47:38 +0100
Subject: [PATCH] Fast image processor (#28847)

* Draft fast image processors

* Draft working fast version

* py3.8 compatible cache

* Enable loading fast image processors through auto

* Tidy up; rescale behaviour based on input type

* Enable tests for fast image processors

* Smarter rescaling

* Don't default to Fast

* Safer imports

* Add necessary Pillow requirement

* Woops

* Add AutoImageProcessor test

* Fix up

* Fix test for imagegpt

* Fix test

* Review comments

* Add warning for TF and JAX input types

* Rearrange

* Return transforms

* NumpyToTensor transformation

* Rebase - include changes from upstream in ImageProcessingMixin

* Safe typing

* Fix up

* convert mean/std to tesnor to rescale

* Don't store transforms in state

* Fix up

* Update src/transformers/image_processing_utils_fast.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Update src/transformers/models/auto/image_processing_auto.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Update src/transformers/models/auto/image_processing_auto.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Update src/transformers/models/auto/image_processing_auto.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Warn if fast image processor available

* Update src/transformers/models/vit/image_processing_vit_fast.py

* Transpose incoming numpy images to be in CHW format

* Update mapping names based on packages, auto set fast to None

* Fix up

* Fix

* Add AutoImageProcessor.from_pretrained(checkpoint, use_fast=True) test

* Update src/transformers/models/vit/image_processing_vit_fast.py

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* Add equivalence and speed tests

* Fix up

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
---
 .../source/en/main_classes/image_processor.md |   5 +
 docs/source/en/model_doc/vit.md               |   7 +-
 examples/pytorch/_tests_requirements.txt      |   1 +
 src/transformers/__init__.py                  |  27 +-
 src/transformers/image_processing_base.py     | 554 ++++++++++++++++++
 src/transformers/image_processing_utils.py    | 534 +----------------
 .../image_processing_utils_fast.py            |  63 ++
 src/transformers/image_transforms.py          |  50 ++
 src/transformers/image_utils.py               |  46 +-
 .../models/auto/image_processing_auto.py      | 331 +++++++----
 src/transformers/models/vit/__init__.py       |  18 +
 .../models/vit/image_processing_vit_fast.py   | 289 +++++++++
 .../utils/dummy_torchvision_objects.py        |  16 +
 .../utils/dummy_vision_objects.py             |   7 +
 .../models/auto/test_image_processing_auto.py |  21 +-
 .../models/beit/test_image_processing_beit.py |   1 +
 .../models/blip/test_image_processing_blip.py |   2 +
 .../test_image_processing_bridgetower.py      |   1 +
 .../test_image_processing_chinese_clip.py     |   2 +
 .../models/clip/test_image_processing_clip.py |   1 +
 .../test_image_processing_conditional_detr.py |   1 +
 .../test_image_processing_convnext.py         |   1 +
 .../test_image_processing_deformable_detr.py  |   1 +
 .../models/deit/test_image_processing_deit.py |   1 +
 .../models/detr/test_image_processing_detr.py |   1 +
 .../donut/test_image_processing_donut.py      |   1 +
 tests/models/dpt/test_image_processing_dpt.py |   1 +
 .../test_image_processing_efficientnet.py     |   1 +
 .../flava/test_image_processing_flava.py      |   1 +
 .../models/glpn/test_image_processing_glpn.py |   1 +
 .../test_image_processing_grounding_dino.py   |   1 +
 .../idefics/test_image_processing_idefics.py  |   1 +
 .../test_image_processing_idefics2.py         |   1 +
 .../test_image_processing_imagegpt.py         |  44 +-
 .../test_image_processing_layoutlmv2.py       |   1 +
 .../test_image_processing_layoutlmv3.py       |   1 +
 .../levit/test_image_processing_levit.py      |   1 +
 .../test_image_processor_llava_next.py        |   1 +
 .../test_image_processing_mask2former.py      |   1 +
 .../test_image_processing_maskformer.py       |   1 +
 .../test_image_processing_mobilenet_v1.py     |   1 +
 .../test_image_processing_mobilenet_v2.py     |   1 +
 .../test_image_processing_mobilevit.py        |   1 +
 .../nougat/test_image_processing_nougat.py    |   1 +
 .../test_image_processing_oneformer.py        |   1 +
 .../owlv2/test_image_processor_owlv2.py       |   1 +
 .../owlvit/test_image_processing_owlvit.py    |   1 +
 .../test_image_processing_pix2struct.py       |   2 +
 .../test_image_processing_poolformer.py       |   1 +
 tests/models/pvt/test_image_processing_pvt.py |   1 +
 .../test_image_processing_segformer.py        |   1 +
 .../seggpt/test_image_processing_seggpt.py    |   1 +
 .../siglip/test_image_processor_siglip.py     |   1 +
 .../test_image_processing_superpoint.py       |   1 +
 .../swin2sr/test_image_processing_swin2sr.py  |   1 +
 tests/models/tvp/test_image_processing_tvp.py |   1 +
 .../test_image_processing_video_llava.py      |   1 +
 .../test_image_processing_videomae.py         |   1 +
 .../models/vilt/test_image_processing_vilt.py |   1 +
 tests/models/vit/test_image_processing_vit.py |  23 +-
 .../test_image_processing_vitmatte.py         |   1 +
 .../vivit/test_image_processing_vivit.py      |   1 +
 .../yolos/test_image_processing_yolos.py      |   1 +
 tests/test_image_processing_common.py         | 373 +++++++-----
 64 files changed, 1645 insertions(+), 813 deletions(-)
 create mode 100644 src/transformers/image_processing_base.py
 create mode 100644 src/transformers/image_processing_utils_fast.py
 create mode 100644 src/transformers/models/vit/image_processing_vit_fast.py
 create mode 100644 src/transformers/utils/dummy_torchvision_objects.py

diff --git a/docs/source/en/main_classes/image_processor.md b/docs/source/en/main_classes/image_processor.md
index 04a3cd1337..59a78e6821 100644
--- a/docs/source/en/main_classes/image_processor.md
+++ b/docs/source/en/main_classes/image_processor.md
@@ -32,3 +32,8 @@ An image processor is in charge of preparing input features for vision models an
 ## BaseImageProcessor
 
 [[autodoc]] image_processing_utils.BaseImageProcessor
+
+
+## BaseImageProcessorFast
+
+[[autodoc]] image_processing_utils_fast.BaseImageProcessorFast
diff --git a/docs/source/en/model_doc/vit.md b/docs/source/en/model_doc/vit.md
index b49cb82185..53a550895c 100644
--- a/docs/source/en/model_doc/vit.md
+++ b/docs/source/en/model_doc/vit.md
@@ -62,7 +62,7 @@ Following the original Vision Transformer, some follow-up works have been made:
 This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be
 found [here](https://github.com/google-research/vision_transformer).
 
-Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models), 
+Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models),
 who already converted the weights from JAX to PyTorch. Credits go to him!
 
 ## Usage tips
@@ -158,6 +158,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 [[autodoc]] ViTImageProcessor
     - preprocess
 
+## ViTImageProcessorFast
+
+[[autodoc]] ViTImageProcessorFast
+    - preprocess
+
 <frameworkcontent>
 <pt>
 
diff --git a/examples/pytorch/_tests_requirements.txt b/examples/pytorch/_tests_requirements.txt
index 2a854b12e6..819b49c799 100644
--- a/examples/pytorch/_tests_requirements.txt
+++ b/examples/pytorch/_tests_requirements.txt
@@ -29,3 +29,4 @@ timm
 albumentations >= 1.4.5
 torchmetrics
 pycocotools
+Pillow>=10.0.1,<=15.0
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 973764da0b..4976a4a1b9 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1104,7 +1104,8 @@ except OptionalDependencyNotAvailable:
         name for name in dir(dummy_vision_objects) if not name.startswith("_")
     ]
 else:
-    _import_structure["image_processing_utils"] = ["ImageProcessingMixin"]
+    _import_structure["image_processing_base"] = ["ImageProcessingMixin"]
+    _import_structure["image_processing_utils"] = ["BaseImageProcessor"]
     _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
     _import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"])
     _import_structure["models.bit"].extend(["BitImageProcessor"])
@@ -1167,6 +1168,18 @@ else:
     _import_structure["models.vivit"].append("VivitImageProcessor")
     _import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"])
 
+try:
+    if not is_torchvision_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_torchvision_objects
+
+    _import_structure["utils.dummy_torchvision_objects"] = [
+        name for name in dir(dummy_torchvision_objects) if not name.startswith("_")
+    ]
+else:
+    _import_structure["image_processing_utils_fast"] = ["BaseImageProcessorFast"]
+    _import_structure["models.vit"].append("ViTImageProcessorFast")
 
 # PyTorch-backed objects
 try:
@@ -5703,7 +5716,8 @@ if TYPE_CHECKING:
     except OptionalDependencyNotAvailable:
         from .utils.dummy_vision_objects import *
     else:
-        from .image_processing_utils import ImageProcessingMixin
+        from .image_processing_base import ImageProcessingMixin
+        from .image_processing_utils import BaseImageProcessor
         from .image_utils import ImageFeatureExtractionMixin
         from .models.beit import BeitFeatureExtractor, BeitImageProcessor
         from .models.bit import BitImageProcessor
@@ -5793,6 +5807,15 @@ if TYPE_CHECKING:
         from .models.vivit import VivitImageProcessor
         from .models.yolos import YolosFeatureExtractor, YolosImageProcessor
 
+    try:
+        if not is_torchvision_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_torchvision_objects import *
+    else:
+        from .image_processing_utils_fast import BaseImageProcessorFast
+        from .models.vit import ViTImageProcessorFast
+
     # Modeling
     try:
         if not is_torch_available():
diff --git a/src/transformers/image_processing_base.py b/src/transformers/image_processing_base.py
new file mode 100644
index 0000000000..6c80aee016
--- /dev/null
+++ b/src/transformers/image_processing_base.py
@@ -0,0 +1,554 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import copy
+import json
+import os
+import warnings
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import requests
+
+from .dynamic_module_utils import custom_object_save
+from .feature_extraction_utils import BatchFeature as BaseBatchFeature
+from .utils import (
+    IMAGE_PROCESSOR_NAME,
+    PushToHubMixin,
+    add_model_info_to_auto_map,
+    add_model_info_to_custom_pipelines,
+    cached_file,
+    copy_func,
+    download_url,
+    is_offline_mode,
+    is_remote_url,
+    is_vision_available,
+    logging,
+)
+
+
+if is_vision_available():
+    from PIL import Image
+
+
+logger = logging.get_logger(__name__)
+
+
+# TODO: Move BatchFeature to be imported by both image_processing_utils and image_processing_utils
+# We override the class string here, but logic is the same.
+class BatchFeature(BaseBatchFeature):
+    r"""
+    Holds the output of the image processor specific `__call__` methods.
+
+    This class is derived from a python dictionary and can be used as a dictionary.
+
+    Args:
+        data (`dict`):
+            Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.).
+        tensor_type (`Union[None, str, TensorType]`, *optional*):
+            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
+            initialization.
+    """
+
+
+# TODO: (Amy) - factor out the common parts of this and the feature extractor
+class ImageProcessingMixin(PushToHubMixin):
+    """
+    This is an image processor mixin used to provide saving/loading functionality for sequential and image feature
+    extractors.
+    """
+
+    _auto_class = None
+
+    def __init__(self, **kwargs):
+        """Set elements of `kwargs` as attributes."""
+        # This key was saved while we still used `XXXFeatureExtractor` for image processing. Now we use
+        # `XXXImageProcessor`, this attribute and its value are misleading.
+        kwargs.pop("feature_extractor_type", None)
+        # Pop "processor_class" as it should be saved as private attribute
+        self._processor_class = kwargs.pop("processor_class", None)
+        # Additional attributes without default values
+        for key, value in kwargs.items():
+            try:
+                setattr(self, key, value)
+            except AttributeError as err:
+                logger.error(f"Can't set {key} with value {value} for {self}")
+                raise err
+
+    def _set_processor_class(self, processor_class: str):
+        """Sets processor class as an attribute."""
+        self._processor_class = processor_class
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        cache_dir: Optional[Union[str, os.PathLike]] = None,
+        force_download: bool = False,
+        local_files_only: bool = False,
+        token: Optional[Union[str, bool]] = None,
+        revision: str = "main",
+        **kwargs,
+    ):
+        r"""
+        Instantiate a type of [`~image_processing_utils.ImageProcessingMixin`] from an image processor.
+
+        Args:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                This can be either:
+
+                - a string, the *model id* of a pretrained image_processor hosted inside a model repo on
+                  huggingface.co.
+                - a path to a *directory* containing a image processor file saved using the
+                  [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved image processor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
+            cache_dir (`str` or `os.PathLike`, *optional*):
+                Path to a directory in which a downloaded pretrained model image processor should be cached if the
+                standard cache should not be used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force to (re-)download the image processor files and override the cached versions if
+                they exist.
+            resume_download:
+                Deprecated and ignored. All downloads are now resumed by default when possible.
+                Will be removed in v5 of Transformers.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+            token (`str` or `bool`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
+                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+                identifier allowed by git.
+
+
+                <Tip>
+
+                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>".
+
+                </Tip>
+
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                If `False`, then this function returns just the final image processor object. If `True`, then this
+                functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
+                consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of
+                `kwargs` which has not been used to update `image_processor` and is otherwise ignored.
+            subfolder (`str`, *optional*, defaults to `""`):
+                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
+                specify the folder name here.
+            kwargs (`Dict[str, Any]`, *optional*):
+                The values in kwargs of any keys which are image processor attributes will be used to override the
+                loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is
+                controlled by the `return_unused_kwargs` keyword parameter.
+
+        Returns:
+            A image processor of type [`~image_processing_utils.ImageProcessingMixin`].
+
+        Examples:
+
+        ```python
+        # We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a
+        # derived class: *CLIPImageProcessor*
+        image_processor = CLIPImageProcessor.from_pretrained(
+            "openai/clip-vit-base-patch32"
+        )  # Download image_processing_config from huggingface.co and cache.
+        image_processor = CLIPImageProcessor.from_pretrained(
+            "./test/saved_model/"
+        )  # E.g. image processor (or model) was saved using *save_pretrained('./test/saved_model/')*
+        image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json")
+        image_processor = CLIPImageProcessor.from_pretrained(
+            "openai/clip-vit-base-patch32", do_normalize=False, foo=False
+        )
+        assert image_processor.do_normalize is False
+        image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained(
+            "openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True
+        )
+        assert image_processor.do_normalize is False
+        assert unused_kwargs == {"foo": False}
+        ```"""
+        kwargs["cache_dir"] = cache_dir
+        kwargs["force_download"] = force_download
+        kwargs["local_files_only"] = local_files_only
+        kwargs["revision"] = revision
+
+        use_auth_token = kwargs.pop("use_auth_token", None)
+        if use_auth_token is not None:
+            warnings.warn(
+                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+                FutureWarning,
+            )
+            if token is not None:
+                raise ValueError(
+                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
+                )
+            token = use_auth_token
+
+        if token is not None:
+            kwargs["token"] = token
+
+        image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
+
+        return cls.from_dict(image_processor_dict, **kwargs)
+
+    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
+        """
+        Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the
+        [`~image_processing_utils.ImageProcessingMixin.from_pretrained`] class method.
+
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the image processor JSON file will be saved (will be created if it does not exist).
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        """
+        use_auth_token = kwargs.pop("use_auth_token", None)
+
+        if use_auth_token is not None:
+            warnings.warn(
+                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+                FutureWarning,
+            )
+            if kwargs.get("token", None) is not None:
+                raise ValueError(
+                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
+                )
+            kwargs["token"] = use_auth_token
+
+        if os.path.isfile(save_directory):
+            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        if push_to_hub:
+            commit_message = kwargs.pop("commit_message", None)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id = self._create_repo(repo_id, **kwargs)
+            files_timestamps = self._get_files_timestamps(save_directory)
+
+        # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
+        # loaded from the Hub.
+        if self._auto_class is not None:
+            custom_object_save(self, save_directory, config=self)
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME)
+
+        self.to_json_file(output_image_processor_file)
+        logger.info(f"Image processor saved in {output_image_processor_file}")
+
+        if push_to_hub:
+            self._upload_modified_files(
+                save_directory,
+                repo_id,
+                files_timestamps,
+                commit_message=commit_message,
+                token=kwargs.get("token"),
+            )
+
+        return [output_image_processor_file]
+
+    @classmethod
+    def get_image_processor_dict(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """
+        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
+        image processor of type [`~image_processor_utils.ImageProcessingMixin`] using `from_dict`.
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
+            subfolder (`str`, *optional*, defaults to `""`):
+                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
+                specify the folder name here.
+
+        Returns:
+            `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", None)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        use_auth_token = kwargs.pop("use_auth_token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", "")
+
+        from_pipeline = kwargs.pop("_from_pipeline", None)
+        from_auto_class = kwargs.pop("_from_auto", False)
+
+        if use_auth_token is not None:
+            warnings.warn(
+                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+                FutureWarning,
+            )
+            if token is not None:
+                raise ValueError(
+                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
+                )
+            token = use_auth_token
+
+        user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class}
+        if from_pipeline is not None:
+            user_agent["using_pipeline"] = from_pipeline
+
+        if is_offline_mode() and not local_files_only:
+            logger.info("Offline mode: forcing local_files_only=True")
+            local_files_only = True
+
+        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+        is_local = os.path.isdir(pretrained_model_name_or_path)
+        if os.path.isdir(pretrained_model_name_or_path):
+            image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME)
+        if os.path.isfile(pretrained_model_name_or_path):
+            resolved_image_processor_file = pretrained_model_name_or_path
+            is_local = True
+        elif is_remote_url(pretrained_model_name_or_path):
+            image_processor_file = pretrained_model_name_or_path
+            resolved_image_processor_file = download_url(pretrained_model_name_or_path)
+        else:
+            image_processor_file = IMAGE_PROCESSOR_NAME
+            try:
+                # Load from local folder or from cache or download from model Hub and cache
+                resolved_image_processor_file = cached_file(
+                    pretrained_model_name_or_path,
+                    image_processor_file,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                    user_agent=user_agent,
+                    revision=revision,
+                    subfolder=subfolder,
+                )
+            except EnvironmentError:
+                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
+                # the original exception.
+                raise
+            except Exception:
+                # For any other exception, we throw a generic error.
+                raise EnvironmentError(
+                    f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
+                    " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
+                    f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
+                    f" directory containing a {IMAGE_PROCESSOR_NAME} file"
+                )
+
+        try:
+            # Load image_processor dict
+            with open(resolved_image_processor_file, "r", encoding="utf-8") as reader:
+                text = reader.read()
+            image_processor_dict = json.loads(text)
+
+        except json.JSONDecodeError:
+            raise EnvironmentError(
+                f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
+            )
+
+        if is_local:
+            logger.info(f"loading configuration file {resolved_image_processor_file}")
+        else:
+            logger.info(
+                f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}"
+            )
+
+        if not is_local:
+            if "auto_map" in image_processor_dict:
+                image_processor_dict["auto_map"] = add_model_info_to_auto_map(
+                    image_processor_dict["auto_map"], pretrained_model_name_or_path
+                )
+            if "custom_pipelines" in image_processor_dict:
+                image_processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines(
+                    image_processor_dict["custom_pipelines"], pretrained_model_name_or_path
+                )
+        return image_processor_dict, kwargs
+
+    @classmethod
+    def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
+        """
+        Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.
+
+        Args:
+            image_processor_dict (`Dict[str, Any]`):
+                Dictionary that will be used to instantiate the image processor object. Such a dictionary can be
+                retrieved from a pretrained checkpoint by leveraging the
+                [`~image_processing_utils.ImageProcessingMixin.to_dict`] method.
+            kwargs (`Dict[str, Any]`):
+                Additional parameters from which to initialize the image processor object.
+
+        Returns:
+            [`~image_processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those
+            parameters.
+        """
+        image_processor_dict = image_processor_dict.copy()
+        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
+
+        # The `size` parameter is a dict and was previously an int or tuple in feature extractors.
+        # We set `size` here directly to the `image_processor_dict` so that it is converted to the appropriate
+        # dict within the image processor and isn't overwritten if `size` is passed in as a kwarg.
+        if "size" in kwargs and "size" in image_processor_dict:
+            image_processor_dict["size"] = kwargs.pop("size")
+        if "crop_size" in kwargs and "crop_size" in image_processor_dict:
+            image_processor_dict["crop_size"] = kwargs.pop("crop_size")
+
+        image_processor = cls(**image_processor_dict)
+
+        # Update image_processor with kwargs if needed
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(image_processor, key):
+                setattr(image_processor, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info(f"Image processor {image_processor}")
+        if return_unused_kwargs:
+            return image_processor, kwargs
+        else:
+            return image_processor
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes this instance to a Python dictionary.
+
+        Returns:
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance.
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["image_processor_type"] = self.__class__.__name__
+
+        return output
+
+    @classmethod
+    def from_json_file(cls, json_file: Union[str, os.PathLike]):
+        """
+        Instantiates a image processor of type [`~image_processing_utils.ImageProcessingMixin`] from the path to a JSON
+        file of parameters.
+
+        Args:
+            json_file (`str` or `os.PathLike`):
+                Path to the JSON file containing the parameters.
+
+        Returns:
+            A image processor of type [`~image_processing_utils.ImageProcessingMixin`]: The image_processor object
+            instantiated from that JSON file.
+        """
+        with open(json_file, "r", encoding="utf-8") as reader:
+            text = reader.read()
+        image_processor_dict = json.loads(text)
+        return cls(**image_processor_dict)
+
+    def to_json_string(self) -> str:
+        """
+        Serializes this instance to a JSON string.
+
+        Returns:
+            `str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
+        """
+        dictionary = self.to_dict()
+
+        for key, value in dictionary.items():
+            if isinstance(value, np.ndarray):
+                dictionary[key] = value.tolist()
+
+        # make sure private name "_processor_class" is correctly
+        # saved as "processor_class"
+        _processor_class = dictionary.pop("_processor_class", None)
+        if _processor_class is not None:
+            dictionary["processor_class"] = _processor_class
+
+        return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path: Union[str, os.PathLike]):
+        """
+        Save this instance to a JSON file.
+
+        Args:
+            json_file_path (`str` or `os.PathLike`):
+                Path to the JSON file in which this image_processor instance's parameters will be saved.
+        """
+        with open(json_file_path, "w", encoding="utf-8") as writer:
+            writer.write(self.to_json_string())
+
+    def __repr__(self):
+        return f"{self.__class__.__name__} {self.to_json_string()}"
+
+    @classmethod
+    def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
+        """
+        Register this class with a given auto class. This should only be used for custom image processors as the ones
+        in the library are already mapped with `AutoImageProcessor `.
+
+        <Tip warning={true}>
+
+        This API is experimental and may have some slight breaking changes in the next releases.
+
+        </Tip>
+
+        Args:
+            auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor "`):
+                The auto class to register this new image processor with.
+        """
+        if not isinstance(auto_class, str):
+            auto_class = auto_class.__name__
+
+        import transformers.models.auto as auto_module
+
+        if not hasattr(auto_module, auto_class):
+            raise ValueError(f"{auto_class} is not a valid auto class.")
+
+        cls._auto_class = auto_class
+
+    def fetch_images(self, image_url_or_urls: Union[str, List[str]]):
+        """
+        Convert a single or a list of urls into the corresponding `PIL.Image` objects.
+
+        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
+        returned.
+        """
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0"
+                " Safari/537.36"
+            )
+        }
+        if isinstance(image_url_or_urls, list):
+            return [self.fetch_images(x) for x in image_url_or_urls]
+        elif isinstance(image_url_or_urls, str):
+            response = requests.get(image_url_or_urls, stream=True, headers=headers)
+            response.raise_for_status()
+            return Image.open(BytesIO(response.content))
+        else:
+            raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
+
+
+ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
+if ImageProcessingMixin.push_to_hub.__doc__ is not None:
+    ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
+        object="image processor", object_class="AutoImageProcessor", object_files="image processor file"
+    )
diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
index d60b5c6f80..4b263446b5 100644
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -13,38 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
-import json
-import os
-import warnings
-from io import BytesIO
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Dict, Iterable, Optional, Union
 
 import numpy as np
-import requests
 
-from .dynamic_module_utils import custom_object_save
-from .feature_extraction_utils import BatchFeature as BaseBatchFeature
+from .image_processing_base import BatchFeature, ImageProcessingMixin
 from .image_transforms import center_crop, normalize, rescale
 from .image_utils import ChannelDimension
-from .utils import (
-    IMAGE_PROCESSOR_NAME,
-    PushToHubMixin,
-    add_model_info_to_auto_map,
-    add_model_info_to_custom_pipelines,
-    cached_file,
-    copy_func,
-    download_url,
-    is_offline_mode,
-    is_remote_url,
-    is_vision_available,
-    logging,
-)
+from .utils import logging
 
 
-if is_vision_available():
-    from PIL import Image
-
 logger = logging.get_logger(__name__)
 
 
@@ -54,505 +32,6 @@ INIT_SERVICE_KWARGS = [
 ]
 
 
-# TODO: Move BatchFeature to be imported by both image_processing_utils and image_processing_utils
-# We override the class string here, but logic is the same.
-class BatchFeature(BaseBatchFeature):
-    r"""
-    Holds the output of the image processor specific `__call__` methods.
-
-    This class is derived from a python dictionary and can be used as a dictionary.
-
-    Args:
-        data (`dict`):
-            Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.).
-        tensor_type (`Union[None, str, TensorType]`, *optional*):
-            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
-            initialization.
-    """
-
-
-# TODO: (Amy) - factor out the common parts of this and the feature extractor
-class ImageProcessingMixin(PushToHubMixin):
-    """
-    This is an image processor mixin used to provide saving/loading functionality for sequential and image feature
-    extractors.
-    """
-
-    _auto_class = None
-
-    def __init__(self, **kwargs):
-        """Set elements of `kwargs` as attributes."""
-        # This key was saved while we still used `XXXFeatureExtractor` for image processing. Now we use
-        # `XXXImageProcessor`, this attribute and its value are misleading.
-        kwargs.pop("feature_extractor_type", None)
-        # Pop "processor_class" as it should be saved as private attribute
-        self._processor_class = kwargs.pop("processor_class", None)
-        # Additional attributes without default values
-        for key, value in kwargs.items():
-            try:
-                setattr(self, key, value)
-            except AttributeError as err:
-                logger.error(f"Can't set {key} with value {value} for {self}")
-                raise err
-
-    def _set_processor_class(self, processor_class: str):
-        """Sets processor class as an attribute."""
-        self._processor_class = processor_class
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        pretrained_model_name_or_path: Union[str, os.PathLike],
-        cache_dir: Optional[Union[str, os.PathLike]] = None,
-        force_download: bool = False,
-        local_files_only: bool = False,
-        token: Optional[Union[str, bool]] = None,
-        revision: str = "main",
-        **kwargs,
-    ):
-        r"""
-        Instantiate a type of [`~image_processing_utils.ImageProcessingMixin`] from an image processor.
-
-        Args:
-            pretrained_model_name_or_path (`str` or `os.PathLike`):
-                This can be either:
-
-                - a string, the *model id* of a pretrained image_processor hosted inside a model repo on
-                  huggingface.co.
-                - a path to a *directory* containing a image processor file saved using the
-                  [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
-                  `./my_model_directory/`.
-                - a path or url to a saved image processor JSON *file*, e.g.,
-                  `./my_model_directory/preprocessor_config.json`.
-            cache_dir (`str` or `os.PathLike`, *optional*):
-                Path to a directory in which a downloaded pretrained model image processor should be cached if the
-                standard cache should not be used.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force to (re-)download the image processor files and override the cached versions if
-                they exist.
-            resume_download:
-                Deprecated and ignored. All downloads are now resumed by default when possible.
-                Will be removed in v5 of Transformers.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
-            token (`str` or `bool`, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
-                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
-                identifier allowed by git.
-
-
-                <Tip>
-
-                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>".
-
-                </Tip>
-
-            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
-                If `False`, then this function returns just the final image processor object. If `True`, then this
-                functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
-                consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of
-                `kwargs` which has not been used to update `image_processor` and is otherwise ignored.
-            subfolder (`str`, *optional*, defaults to `""`):
-                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
-                specify the folder name here.
-            kwargs (`Dict[str, Any]`, *optional*):
-                The values in kwargs of any keys which are image processor attributes will be used to override the
-                loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is
-                controlled by the `return_unused_kwargs` keyword parameter.
-
-        Returns:
-            A image processor of type [`~image_processing_utils.ImageProcessingMixin`].
-
-        Examples:
-
-        ```python
-        # We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a
-        # derived class: *CLIPImageProcessor*
-        image_processor = CLIPImageProcessor.from_pretrained(
-            "openai/clip-vit-base-patch32"
-        )  # Download image_processing_config from huggingface.co and cache.
-        image_processor = CLIPImageProcessor.from_pretrained(
-            "./test/saved_model/"
-        )  # E.g. image processor (or model) was saved using *save_pretrained('./test/saved_model/')*
-        image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json")
-        image_processor = CLIPImageProcessor.from_pretrained(
-            "openai/clip-vit-base-patch32", do_normalize=False, foo=False
-        )
-        assert image_processor.do_normalize is False
-        image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained(
-            "openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True
-        )
-        assert image_processor.do_normalize is False
-        assert unused_kwargs == {"foo": False}
-        ```"""
-        kwargs["cache_dir"] = cache_dir
-        kwargs["force_download"] = force_download
-        kwargs["local_files_only"] = local_files_only
-        kwargs["revision"] = revision
-
-        use_auth_token = kwargs.pop("use_auth_token", None)
-        if use_auth_token is not None:
-            warnings.warn(
-                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
-                FutureWarning,
-            )
-            if token is not None:
-                raise ValueError(
-                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
-                )
-            token = use_auth_token
-
-        if token is not None:
-            kwargs["token"] = token
-
-        image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
-
-        return cls.from_dict(image_processor_dict, **kwargs)
-
-    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
-        """
-        Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the
-        [`~image_processing_utils.ImageProcessingMixin.from_pretrained`] class method.
-
-        Args:
-            save_directory (`str` or `os.PathLike`):
-                Directory where the image processor JSON file will be saved (will be created if it does not exist).
-            push_to_hub (`bool`, *optional*, defaults to `False`):
-                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
-                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
-                namespace).
-            kwargs (`Dict[str, Any]`, *optional*):
-                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
-        """
-        use_auth_token = kwargs.pop("use_auth_token", None)
-
-        if use_auth_token is not None:
-            warnings.warn(
-                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
-                FutureWarning,
-            )
-            if kwargs.get("token", None) is not None:
-                raise ValueError(
-                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
-                )
-            kwargs["token"] = use_auth_token
-
-        if os.path.isfile(save_directory):
-            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
-
-        os.makedirs(save_directory, exist_ok=True)
-
-        if push_to_hub:
-            commit_message = kwargs.pop("commit_message", None)
-            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
-            repo_id = self._create_repo(repo_id, **kwargs)
-            files_timestamps = self._get_files_timestamps(save_directory)
-
-        # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
-        # loaded from the Hub.
-        if self._auto_class is not None:
-            custom_object_save(self, save_directory, config=self)
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME)
-
-        self.to_json_file(output_image_processor_file)
-        logger.info(f"Image processor saved in {output_image_processor_file}")
-
-        if push_to_hub:
-            self._upload_modified_files(
-                save_directory,
-                repo_id,
-                files_timestamps,
-                commit_message=commit_message,
-                token=kwargs.get("token"),
-            )
-
-        return [output_image_processor_file]
-
-    @classmethod
-    def get_image_processor_dict(
-        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
-    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-        """
-        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
-        image processor of type [`~image_processor_utils.ImageProcessingMixin`] using `from_dict`.
-
-        Parameters:
-            pretrained_model_name_or_path (`str` or `os.PathLike`):
-                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
-            subfolder (`str`, *optional*, defaults to `""`):
-                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
-                specify the folder name here.
-
-        Returns:
-            `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.
-        """
-        cache_dir = kwargs.pop("cache_dir", None)
-        force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", None)
-        proxies = kwargs.pop("proxies", None)
-        token = kwargs.pop("token", None)
-        use_auth_token = kwargs.pop("use_auth_token", None)
-        local_files_only = kwargs.pop("local_files_only", False)
-        revision = kwargs.pop("revision", None)
-        subfolder = kwargs.pop("subfolder", "")
-
-        from_pipeline = kwargs.pop("_from_pipeline", None)
-        from_auto_class = kwargs.pop("_from_auto", False)
-
-        if use_auth_token is not None:
-            warnings.warn(
-                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
-                FutureWarning,
-            )
-            if token is not None:
-                raise ValueError(
-                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
-                )
-            token = use_auth_token
-
-        user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class}
-        if from_pipeline is not None:
-            user_agent["using_pipeline"] = from_pipeline
-
-        if is_offline_mode() and not local_files_only:
-            logger.info("Offline mode: forcing local_files_only=True")
-            local_files_only = True
-
-        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
-        is_local = os.path.isdir(pretrained_model_name_or_path)
-        if os.path.isdir(pretrained_model_name_or_path):
-            image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME)
-        if os.path.isfile(pretrained_model_name_or_path):
-            resolved_image_processor_file = pretrained_model_name_or_path
-            is_local = True
-        elif is_remote_url(pretrained_model_name_or_path):
-            image_processor_file = pretrained_model_name_or_path
-            resolved_image_processor_file = download_url(pretrained_model_name_or_path)
-        else:
-            image_processor_file = IMAGE_PROCESSOR_NAME
-            try:
-                # Load from local folder or from cache or download from model Hub and cache
-                resolved_image_processor_file = cached_file(
-                    pretrained_model_name_or_path,
-                    image_processor_file,
-                    cache_dir=cache_dir,
-                    force_download=force_download,
-                    proxies=proxies,
-                    resume_download=resume_download,
-                    local_files_only=local_files_only,
-                    token=token,
-                    user_agent=user_agent,
-                    revision=revision,
-                    subfolder=subfolder,
-                )
-            except EnvironmentError:
-                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
-                # the original exception.
-                raise
-            except Exception:
-                # For any other exception, we throw a generic error.
-                raise EnvironmentError(
-                    f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
-                    " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
-                    f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
-                    f" directory containing a {IMAGE_PROCESSOR_NAME} file"
-                )
-
-        try:
-            # Load image_processor dict
-            with open(resolved_image_processor_file, "r", encoding="utf-8") as reader:
-                text = reader.read()
-            image_processor_dict = json.loads(text)
-
-        except json.JSONDecodeError:
-            raise EnvironmentError(
-                f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
-            )
-
-        if is_local:
-            logger.info(f"loading configuration file {resolved_image_processor_file}")
-        else:
-            logger.info(
-                f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}"
-            )
-
-        if not is_local:
-            if "auto_map" in image_processor_dict:
-                image_processor_dict["auto_map"] = add_model_info_to_auto_map(
-                    image_processor_dict["auto_map"], pretrained_model_name_or_path
-                )
-            if "custom_pipelines" in image_processor_dict:
-                image_processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines(
-                    image_processor_dict["custom_pipelines"], pretrained_model_name_or_path
-                )
-        return image_processor_dict, kwargs
-
-    @classmethod
-    def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
-        """
-        Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.
-
-        Args:
-            image_processor_dict (`Dict[str, Any]`):
-                Dictionary that will be used to instantiate the image processor object. Such a dictionary can be
-                retrieved from a pretrained checkpoint by leveraging the
-                [`~image_processing_utils.ImageProcessingMixin.to_dict`] method.
-            kwargs (`Dict[str, Any]`):
-                Additional parameters from which to initialize the image processor object.
-
-        Returns:
-            [`~image_processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those
-            parameters.
-        """
-        image_processor_dict = image_processor_dict.copy()
-        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
-
-        # The `size` parameter is a dict and was previously an int or tuple in feature extractors.
-        # We set `size` here directly to the `image_processor_dict` so that it is converted to the appropriate
-        # dict within the image processor and isn't overwritten if `size` is passed in as a kwarg.
-        if "size" in kwargs and "size" in image_processor_dict:
-            image_processor_dict["size"] = kwargs.pop("size")
-        if "crop_size" in kwargs and "crop_size" in image_processor_dict:
-            image_processor_dict["crop_size"] = kwargs.pop("crop_size")
-
-        image_processor = cls(**image_processor_dict)
-
-        # Update image_processor with kwargs if needed
-        to_remove = []
-        for key, value in kwargs.items():
-            if hasattr(image_processor, key):
-                setattr(image_processor, key, value)
-                to_remove.append(key)
-        for key in to_remove:
-            kwargs.pop(key, None)
-
-        logger.info(f"Image processor {image_processor}")
-        if return_unused_kwargs:
-            return image_processor, kwargs
-        else:
-            return image_processor
-
-    def to_dict(self) -> Dict[str, Any]:
-        """
-        Serializes this instance to a Python dictionary.
-
-        Returns:
-            `Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance.
-        """
-        output = copy.deepcopy(self.__dict__)
-        output["image_processor_type"] = self.__class__.__name__
-
-        return output
-
-    @classmethod
-    def from_json_file(cls, json_file: Union[str, os.PathLike]):
-        """
-        Instantiates a image processor of type [`~image_processing_utils.ImageProcessingMixin`] from the path to a JSON
-        file of parameters.
-
-        Args:
-            json_file (`str` or `os.PathLike`):
-                Path to the JSON file containing the parameters.
-
-        Returns:
-            A image processor of type [`~image_processing_utils.ImageProcessingMixin`]: The image_processor object
-            instantiated from that JSON file.
-        """
-        with open(json_file, "r", encoding="utf-8") as reader:
-            text = reader.read()
-        image_processor_dict = json.loads(text)
-        return cls(**image_processor_dict)
-
-    def to_json_string(self) -> str:
-        """
-        Serializes this instance to a JSON string.
-
-        Returns:
-            `str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
-        """
-        dictionary = self.to_dict()
-
-        for key, value in dictionary.items():
-            if isinstance(value, np.ndarray):
-                dictionary[key] = value.tolist()
-
-        # make sure private name "_processor_class" is correctly
-        # saved as "processor_class"
-        _processor_class = dictionary.pop("_processor_class", None)
-        if _processor_class is not None:
-            dictionary["processor_class"] = _processor_class
-
-        return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
-
-    def to_json_file(self, json_file_path: Union[str, os.PathLike]):
-        """
-        Save this instance to a JSON file.
-
-        Args:
-            json_file_path (`str` or `os.PathLike`):
-                Path to the JSON file in which this image_processor instance's parameters will be saved.
-        """
-        with open(json_file_path, "w", encoding="utf-8") as writer:
-            writer.write(self.to_json_string())
-
-    def __repr__(self):
-        return f"{self.__class__.__name__} {self.to_json_string()}"
-
-    @classmethod
-    def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
-        """
-        Register this class with a given auto class. This should only be used for custom image processors as the ones
-        in the library are already mapped with `AutoImageProcessor `.
-
-        <Tip warning={true}>
-
-        This API is experimental and may have some slight breaking changes in the next releases.
-
-        </Tip>
-
-        Args:
-            auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor "`):
-                The auto class to register this new image processor with.
-        """
-        if not isinstance(auto_class, str):
-            auto_class = auto_class.__name__
-
-        import transformers.models.auto as auto_module
-
-        if not hasattr(auto_module, auto_class):
-            raise ValueError(f"{auto_class} is not a valid auto class.")
-
-        cls._auto_class = auto_class
-
-    def fetch_images(self, image_url_or_urls: Union[str, List[str]]):
-        """
-        Convert a single or a list of urls into the corresponding `PIL.Image` objects.
-
-        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
-        returned.
-        """
-        headers = {
-            "User-Agent": (
-                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0"
-                " Safari/537.36"
-            )
-        }
-        if isinstance(image_url_or_urls, list):
-            return [self.fetch_images(x) for x in image_url_or_urls]
-        elif isinstance(image_url_or_urls, str):
-            response = requests.get(image_url_or_urls, stream=True, headers=headers)
-            response.raise_for_status()
-            return Image.open(BytesIO(response.content))
-        else:
-            raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
-
-
 class BaseImageProcessor(ImageProcessingMixin):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -801,10 +280,3 @@ def select_best_resolution(original_size: tuple, possible_resolutions: list) ->
             best_fit = (height, width)
 
     return best_fit
-
-
-ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
-if ImageProcessingMixin.push_to_hub.__doc__ is not None:
-    ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
-        object="image processor", object_class="AutoImageProcessor", object_files="image processor file"
-    )
diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py
new file mode 100644
index 0000000000..daeee3e1bd
--- /dev/null
+++ b/src/transformers/image_processing_utils_fast.py
@@ -0,0 +1,63 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+from dataclasses import dataclass
+
+from .image_processing_utils import BaseImageProcessor
+from .utils.import_utils import is_torchvision_available
+
+
+if is_torchvision_available():
+    from torchvision.transforms import Compose
+
+
+@dataclass(frozen=True)
+class SizeDict:
+    """
+    Hashable dictionary to store image size information.
+    """
+
+    height: int = None
+    width: int = None
+    longest_edge: int = None
+    shortest_edge: int = None
+    max_height: int = None
+    max_width: int = None
+
+    def __getitem__(self, key):
+        if hasattr(self, key):
+            return getattr(self, key)
+        raise KeyError(f"Key {key} not found in SizeDict.")
+
+
+class BaseImageProcessorFast(BaseImageProcessor):
+    _transform_params = None
+
+    def _build_transforms(self, **kwargs) -> "Compose":
+        """
+        Given the input settings e.g. do_resize, build the image transforms.
+        """
+        raise NotImplementedError
+
+    def _validate_params(self, **kwargs) -> None:
+        for k, v in kwargs.items():
+            if k not in self._transform_params:
+                raise ValueError(f"Invalid transform parameter {k}={v}.")
+
+    @functools.lru_cache(maxsize=1)
+    def get_transforms(self, **kwargs) -> "Compose":
+        self._validate_params(**kwargs)
+        return self._build_transforms(**kwargs)
diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
index 65d6413db7..4e4812879e 100644
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@@ -31,6 +31,7 @@ from .utils.import_utils import (
     is_flax_available,
     is_tf_available,
     is_torch_available,
+    is_torchvision_available,
     is_vision_available,
     requires_backends,
 )
@@ -50,6 +51,9 @@ if is_tf_available():
 if is_flax_available():
     import jax.numpy as jnp
 
+if is_torchvision_available():
+    from torchvision.transforms import functional as F
+
 
 def to_channel_dimension_format(
     image: np.ndarray,
@@ -374,6 +378,7 @@ def normalize(
 
     if input_data_format is None:
         input_data_format = infer_channel_dimension_format(image)
+
     channel_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
     num_channels = image.shape[channel_axis]
 
@@ -802,3 +807,48 @@ def flip_channel_order(
     if data_format is not None:
         image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
     return image
+
+
+def _cast_tensor_to_float(x):
+    if x.is_floating_point():
+        return x
+    return x.float()
+
+
+class FusedRescaleNormalize:
+    """
+    Rescale and normalize the input image in one step.
+    """
+
+    def __init__(self, mean, std, rescale_factor: float = 1.0, inplace: bool = False):
+        self.mean = torch.tensor(mean) * (1.0 / rescale_factor)
+        self.std = torch.tensor(std) * (1.0 / rescale_factor)
+        self.inplace = inplace
+
+    def __call__(self, image: "torch.Tensor"):
+        image = _cast_tensor_to_float(image)
+        return F.normalize(image, self.mean, self.std, inplace=self.inplace)
+
+
+class Rescale:
+    """
+    Rescale the input image by rescale factor: image *= rescale_factor.
+    """
+
+    def __init__(self, rescale_factor: float = 1.0):
+        self.rescale_factor = rescale_factor
+
+    def __call__(self, image: "torch.Tensor"):
+        image = image * self.rescale_factor
+        return image
+
+
+class NumpyToTensor:
+    """
+    Convert a numpy array to a PyTorch tensor.
+    """
+
+    def __call__(self, image: np.ndarray):
+        # Same as in PyTorch, we assume incoming numpy images are in HWC format
+        # c.f. https://github.com/pytorch/vision/blob/61d97f41bc209e1407dcfbd685d2ee2da9c1cdad/torchvision/transforms/functional.py#L154
+        return torch.from_numpy(image.transpose(2, 0, 1)).contiguous()
diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index aaa9e4eadc..aa09e74558 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -25,9 +25,11 @@ from packaging import version
 from .utils import (
     ExplicitEnum,
     is_jax_tensor,
+    is_numpy_array,
     is_tf_tensor,
     is_torch_available,
     is_torch_tensor,
+    is_torchvision_available,
     is_vision_available,
     logging,
     requires_backends,
@@ -52,6 +54,20 @@ if is_vision_available():
     else:
         PILImageResampling = PIL.Image
 
+    if is_torchvision_available():
+        from torchvision.transforms import InterpolationMode
+
+        pil_torch_interpolation_mapping = {
+            PILImageResampling.NEAREST: InterpolationMode.NEAREST,
+            PILImageResampling.BOX: InterpolationMode.BOX,
+            PILImageResampling.BILINEAR: InterpolationMode.BILINEAR,
+            PILImageResampling.HAMMING: InterpolationMode.HAMMING,
+            PILImageResampling.BICUBIC: InterpolationMode.BICUBIC,
+            PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
+            PILImageResampling.NEAREST: InterpolationMode.NEAREST,
+        }
+
+
 if TYPE_CHECKING:
     if is_torch_available():
         import torch
@@ -90,14 +106,30 @@ def is_pil_image(img):
     return is_vision_available() and isinstance(img, PIL.Image.Image)
 
 
+class ImageType(ExplicitEnum):
+    PIL = "pillow"
+    TORCH = "torch"
+    NUMPY = "numpy"
+    TENSORFLOW = "tensorflow"
+    JAX = "jax"
+
+
+def get_image_type(image):
+    if is_pil_image(image):
+        return ImageType.PIL
+    if is_torch_tensor(image):
+        return ImageType.TORCH
+    if is_numpy_array(image):
+        return ImageType.NUMPY
+    if is_tf_tensor(image):
+        return ImageType.TENSORFLOW
+    if is_jax_tensor(image):
+        return ImageType.JAX
+    raise ValueError(f"Unrecognised image type {type(image)}")
+
+
 def is_valid_image(img):
-    return (
-        (is_vision_available() and isinstance(img, PIL.Image.Image))
-        or isinstance(img, np.ndarray)
-        or is_torch_tensor(img)
-        or is_tf_tensor(img)
-        or is_jax_tensor(img)
-    )
+    return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
 
 
 def valid_images(imgs):
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index eb21b58e20..b316a1a55d 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -19,13 +19,21 @@ import json
 import os
 import warnings
 from collections import OrderedDict
-from typing import Dict, Optional, Union
+from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
 
 # Build the list of all image processors
 from ...configuration_utils import PretrainedConfig
 from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
-from ...image_processing_utils import ImageProcessingMixin
-from ...utils import CONFIG_NAME, IMAGE_PROCESSOR_NAME, get_file_from_repo, logging
+from ...image_processing_utils import BaseImageProcessor, ImageProcessingMixin
+from ...image_processing_utils_fast import BaseImageProcessorFast
+from ...utils import (
+    CONFIG_NAME,
+    IMAGE_PROCESSOR_NAME,
+    get_file_from_repo,
+    is_torchvision_available,
+    is_vision_available,
+    logging,
+)
 from .auto_factory import _LazyAutoMapping
 from .configuration_auto import (
     CONFIG_MAPPING_NAMES,
@@ -37,104 +45,125 @@ from .configuration_auto import (
 
 logger = logging.get_logger(__name__)
 
-IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
-    [
-        ("align", "EfficientNetImageProcessor"),
-        ("beit", "BeitImageProcessor"),
-        ("bit", "BitImageProcessor"),
-        ("blip", "BlipImageProcessor"),
-        ("blip-2", "BlipImageProcessor"),
-        ("bridgetower", "BridgeTowerImageProcessor"),
-        ("chinese_clip", "ChineseCLIPImageProcessor"),
-        ("clip", "CLIPImageProcessor"),
-        ("clipseg", "ViTImageProcessor"),
-        ("conditional_detr", "ConditionalDetrImageProcessor"),
-        ("convnext", "ConvNextImageProcessor"),
-        ("convnextv2", "ConvNextImageProcessor"),
-        ("cvt", "ConvNextImageProcessor"),
-        ("data2vec-vision", "BeitImageProcessor"),
-        ("deformable_detr", "DeformableDetrImageProcessor"),
-        ("deit", "DeiTImageProcessor"),
-        ("depth_anything", "DPTImageProcessor"),
-        ("deta", "DetaImageProcessor"),
-        ("detr", "DetrImageProcessor"),
-        ("dinat", "ViTImageProcessor"),
-        ("dinov2", "BitImageProcessor"),
-        ("donut-swin", "DonutImageProcessor"),
-        ("dpt", "DPTImageProcessor"),
-        ("efficientformer", "EfficientFormerImageProcessor"),
-        ("efficientnet", "EfficientNetImageProcessor"),
-        ("flava", "FlavaImageProcessor"),
-        ("focalnet", "BitImageProcessor"),
-        ("fuyu", "FuyuImageProcessor"),
-        ("git", "CLIPImageProcessor"),
-        ("glpn", "GLPNImageProcessor"),
-        ("grounding-dino", "GroundingDinoImageProcessor"),
-        ("groupvit", "CLIPImageProcessor"),
-        ("idefics", "IdeficsImageProcessor"),
-        ("idefics2", "Idefics2ImageProcessor"),
-        ("imagegpt", "ImageGPTImageProcessor"),
-        ("instructblip", "BlipImageProcessor"),
-        ("kosmos-2", "CLIPImageProcessor"),
-        ("layoutlmv2", "LayoutLMv2ImageProcessor"),
-        ("layoutlmv3", "LayoutLMv3ImageProcessor"),
-        ("levit", "LevitImageProcessor"),
-        ("llava", "CLIPImageProcessor"),
-        ("llava_next", "LlavaNextImageProcessor"),
-        ("mask2former", "Mask2FormerImageProcessor"),
-        ("maskformer", "MaskFormerImageProcessor"),
-        ("mgp-str", "ViTImageProcessor"),
-        ("mobilenet_v1", "MobileNetV1ImageProcessor"),
-        ("mobilenet_v2", "MobileNetV2ImageProcessor"),
-        ("mobilevit", "MobileViTImageProcessor"),
-        ("mobilevit", "MobileViTImageProcessor"),
-        ("mobilevitv2", "MobileViTImageProcessor"),
-        ("nat", "ViTImageProcessor"),
-        ("nougat", "NougatImageProcessor"),
-        ("oneformer", "OneFormerImageProcessor"),
-        ("owlv2", "Owlv2ImageProcessor"),
-        ("owlvit", "OwlViTImageProcessor"),
-        ("paligemma", "CLIPImageProcessor"),
-        ("perceiver", "PerceiverImageProcessor"),
-        ("pix2struct", "Pix2StructImageProcessor"),
-        ("poolformer", "PoolFormerImageProcessor"),
-        ("pvt", "PvtImageProcessor"),
-        ("pvt_v2", "PvtImageProcessor"),
-        ("regnet", "ConvNextImageProcessor"),
-        ("resnet", "ConvNextImageProcessor"),
-        ("sam", "SamImageProcessor"),
-        ("segformer", "SegformerImageProcessor"),
-        ("seggpt", "SegGptImageProcessor"),
-        ("siglip", "SiglipImageProcessor"),
-        ("swiftformer", "ViTImageProcessor"),
-        ("swin", "ViTImageProcessor"),
-        ("swin2sr", "Swin2SRImageProcessor"),
-        ("swinv2", "ViTImageProcessor"),
-        ("table-transformer", "DetrImageProcessor"),
-        ("timesformer", "VideoMAEImageProcessor"),
-        ("tvlt", "TvltImageProcessor"),
-        ("tvp", "TvpImageProcessor"),
-        ("udop", "LayoutLMv3ImageProcessor"),
-        ("upernet", "SegformerImageProcessor"),
-        ("van", "ConvNextImageProcessor"),
-        ("video_llava", "VideoLlavaImageProcessor"),
-        ("videomae", "VideoMAEImageProcessor"),
-        ("vilt", "ViltImageProcessor"),
-        ("vipllava", "CLIPImageProcessor"),
-        ("vit", "ViTImageProcessor"),
-        ("vit_hybrid", "ViTHybridImageProcessor"),
-        ("vit_mae", "ViTImageProcessor"),
-        ("vit_msn", "ViTImageProcessor"),
-        ("vitmatte", "VitMatteImageProcessor"),
-        ("xclip", "CLIPImageProcessor"),
-        ("yolos", "YolosImageProcessor"),
-    ]
-)
+
+if TYPE_CHECKING:
+    # This significantly improves completion suggestion performance when
+    # the transformers package is used with Microsoft's Pylance language server.
+    IMAGE_PROCESSOR_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
+else:
+    IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
+        [
+            ("align", ("EfficientNetImageProcessor",)),
+            ("beit", ("BeitImageProcessor",)),
+            ("bit", ("BitImageProcessor",)),
+            ("blip", ("BlipImageProcessor",)),
+            ("blip-2", ("BlipImageProcessor",)),
+            ("bridgetower", ("BridgeTowerImageProcessor",)),
+            ("chinese_clip", ("ChineseCLIPImageProcessor",)),
+            ("clip", ("CLIPImageProcessor",)),
+            ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")),
+            ("conditional_detr", ("ConditionalDetrImageProcessor",)),
+            ("convnext", ("ConvNextImageProcessor",)),
+            ("convnextv2", ("ConvNextImageProcessor",)),
+            ("cvt", ("ConvNextImageProcessor",)),
+            ("data2vec-vision", ("BeitImageProcessor",)),
+            ("deformable_detr", ("DeformableDetrImageProcessor",)),
+            ("deit", ("DeiTImageProcessor",)),
+            ("depth_anything", ("DPTImageProcessor",)),
+            ("deta", ("DetaImageProcessor",)),
+            ("detr", ("DetrImageProcessor",)),
+            ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
+            ("dinov2", ("BitImageProcessor",)),
+            ("donut-swin", ("DonutImageProcessor",)),
+            ("dpt", ("DPTImageProcessor",)),
+            ("efficientformer", ("EfficientFormerImageProcessor",)),
+            ("efficientnet", ("EfficientNetImageProcessor",)),
+            ("flava", ("FlavaImageProcessor",)),
+            ("focalnet", ("BitImageProcessor",)),
+            ("fuyu", ("FuyuImageProcessor",)),
+            ("git", ("CLIPImageProcessor",)),
+            ("glpn", ("GLPNImageProcessor",)),
+            ("grounding-dino", ("GroundingDinoImageProcessor",)),
+            ("groupvit", ("CLIPImageProcessor",)),
+            ("idefics", ("IdeficsImageProcessor",)),
+            ("idefics2", ("Idefics2ImageProcessor",)),
+            ("imagegpt", ("ImageGPTImageProcessor",)),
+            ("instructblip", ("BlipImageProcessor",)),
+            ("kosmos-2", ("CLIPImageProcessor",)),
+            ("layoutlmv2", ("LayoutLMv2ImageProcessor",)),
+            ("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
+            ("levit", ("LevitImageProcessor",)),
+            ("llava", ("CLIPImageProcessor",)),
+            ("llava_next", ("LlavaNextImageProcessor",)),
+            ("mask2former", ("Mask2FormerImageProcessor",)),
+            ("maskformer", ("MaskFormerImageProcessor",)),
+            ("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")),
+            ("mobilenet_v1", ("MobileNetV1ImageProcessor",)),
+            ("mobilenet_v2", ("MobileNetV2ImageProcessor",)),
+            ("mobilevit", ("MobileViTImageProcessor",)),
+            ("mobilevit", ("MobileViTImageProcessor",)),
+            ("mobilevitv2", ("MobileViTImageProcessor",)),
+            ("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
+            ("nougat", ("NougatImageProcessor",)),
+            ("oneformer", ("OneFormerImageProcessor",)),
+            ("owlv2", ("Owlv2ImageProcessor",)),
+            ("owlvit", ("OwlViTImageProcessor",)),
+            ("perceiver", ("PerceiverImageProcessor",)),
+            ("pix2struct", ("Pix2StructImageProcessor",)),
+            ("poolformer", ("PoolFormerImageProcessor",)),
+            ("pvt", ("PvtImageProcessor",)),
+            ("pvt_v2", ("PvtImageProcessor",)),
+            ("regnet", ("ConvNextImageProcessor",)),
+            ("resnet", ("ConvNextImageProcessor",)),
+            ("sam", ("SamImageProcessor",)),
+            ("segformer", ("SegformerImageProcessor",)),
+            ("seggpt", ("SegGptImageProcessor",)),
+            ("siglip", ("SiglipImageProcessor",)),
+            ("swiftformer", ("ViTImageProcessor", "ViTImageProcessorFast")),
+            ("swin", ("ViTImageProcessor", "ViTImageProcessorFast")),
+            ("swin2sr", ("Swin2SRImageProcessor",)),
+            ("swinv2", ("ViTImageProcessor", "ViTImageProcessorFast")),
+            ("table-transformer", ("DetrImageProcessor",)),
+            ("timesformer", ("VideoMAEImageProcessor",)),
+            ("tvlt", ("TvltImageProcessor",)),
+            ("tvp", ("TvpImageProcessor",)),
+            ("udop", ("LayoutLMv3ImageProcessor",)),
+            ("upernet", ("SegformerImageProcessor",)),
+            ("van", ("ConvNextImageProcessor",)),
+            ("videomae", ("VideoMAEImageProcessor",)),
+            ("vilt", ("ViltImageProcessor",)),
+            ("vipllava", ("CLIPImageProcessor",)),
+            ("vit", ("ViTImageProcessor", "ViTImageProcessorFast")),
+            ("vit_hybrid", ("ViTHybridImageProcessor",)),
+            ("vit_mae", ("ViTImageProcessor", "ViTImageProcessorFast")),
+            ("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")),
+            ("vitmatte", ("VitMatteImageProcessor",)),
+            ("xclip", ("CLIPImageProcessor",)),
+            ("yolos", ("YolosImageProcessor",)),
+        ]
+    )
+
+for model_type, image_processors in IMAGE_PROCESSOR_MAPPING_NAMES.items():
+    slow_image_processor_class, *fast_image_processor_class = image_processors
+    if not is_vision_available():
+        slow_image_processor_class = None
+
+    # If the fast image processor is not defined, or torchvision is not available, we set it to None
+    if not fast_image_processor_class or fast_image_processor_class[0] is None or not is_torchvision_available():
+        fast_image_processor_class = None
+    else:
+        fast_image_processor_class = fast_image_processor_class[0]
+
+    IMAGE_PROCESSOR_MAPPING_NAMES[model_type] = (slow_image_processor_class, fast_image_processor_class)
+
 
 IMAGE_PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, IMAGE_PROCESSOR_MAPPING_NAMES)
 
 
 def image_processor_class_from_name(class_name: str):
+    if class_name == "BaseImageProcessorFast":
+        return BaseImageProcessorFast
+
     for module_name, extractors in IMAGE_PROCESSOR_MAPPING_NAMES.items():
         if class_name in extractors:
             module_name = model_type_to_module_name(module_name)
@@ -145,11 +174,12 @@ def image_processor_class_from_name(class_name: str):
             except AttributeError:
                 continue
 
-    for _, extractor in IMAGE_PROCESSOR_MAPPING._extra_content.items():
-        if getattr(extractor, "__name__", None) == class_name:
-            return extractor
+    for _, extractors in IMAGE_PROCESSOR_MAPPING._extra_content.items():
+        for extractor in extractors:
+            if getattr(extractor, "__name__", None) == class_name:
+                return extractor
 
-    # We did not fine the class, but maybe it's because a dep is missing. In that case, the class will be in the main
+    # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
     # init and we return the proper dummy to get an appropriate error message.
     main_module = importlib.import_module("transformers")
     if hasattr(main_module, class_name):
@@ -258,6 +288,13 @@ def get_image_processor_config(
         return json.load(reader)
 
 
+def _warning_fast_image_processor_available(fast_class):
+    logger.warning(
+        f"Fast image processor class {fast_class} is available for this model. "
+        "Using slow image processor class. To use the fast image processor class set `use_fast=True`."
+    )
+
+
 class AutoImageProcessor:
     r"""
     This is a generic image processor class that will be instantiated as one of the image processor classes of the
@@ -274,7 +311,7 @@ class AutoImageProcessor:
 
     @classmethod
     @replace_list_option_in_docstrings(IMAGE_PROCESSOR_MAPPING_NAMES)
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         r"""
         Instantiate one of the image processor classes of the library from a pretrained model vocabulary.
 
@@ -314,6 +351,10 @@ class AutoImageProcessor:
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                 git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
+            use_fast (`bool`, *optional*, defaults to `False`):
+                Use a fast torchvision-base image processor if it is supported for a given model.
+                If a fast tokenizer is not available for a given model, a normal numpy-based image processor
+                is returned instead.
             return_unused_kwargs (`bool`, *optional*, defaults to `False`):
                 If `False`, then this function returns just the final image processor object. If `True`, then this
                 functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
@@ -358,6 +399,7 @@ class AutoImageProcessor:
             kwargs["token"] = use_auth_token
 
         config = kwargs.pop("config", None)
+        use_fast = kwargs.pop("use_fast", False)
         trust_remote_code = kwargs.pop("trust_remote_code", None)
         kwargs["_from_auto"] = True
 
@@ -387,6 +429,11 @@ class AutoImageProcessor:
                 image_processor_auto_map = config.auto_map["AutoImageProcessor"]
 
         if image_processor_class is not None:
+            # Update class name to reflect the use_fast option. If class is not found, None is returned.
+            if use_fast and not image_processor_class.endswith("Fast"):
+                image_processor_class += "Fast"
+            elif not use_fast and image_processor_class.endswith("Fast"):
+                image_processor_class = image_processor_class[:-4]
             image_processor_class = image_processor_class_from_name(image_processor_class)
 
         has_remote_code = image_processor_auto_map is not None
@@ -395,10 +442,19 @@ class AutoImageProcessor:
             trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
         )
 
+        if image_processor_auto_map is not None and not isinstance(image_processor_auto_map, tuple):
+            # In some configs, only the slow image processor class is stored
+            image_processor_auto_map = (image_processor_auto_map, None)
+
         if has_remote_code and trust_remote_code:
-            image_processor_class = get_class_from_dynamic_module(
-                image_processor_auto_map, pretrained_model_name_or_path, **kwargs
-            )
+            if not use_fast and image_processor_auto_map[1] is not None:
+                _warning_fast_image_processor_available(image_processor_auto_map[1])
+
+            if use_fast and image_processor_auto_map[1] is not None:
+                class_ref = image_processor_auto_map[1]
+            else:
+                class_ref = image_processor_auto_map[0]
+            image_processor_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
             _ = kwargs.pop("code_revision", None)
             if os.path.isdir(pretrained_model_name_or_path):
                 image_processor_class.register_for_auto_class()
@@ -407,8 +463,22 @@ class AutoImageProcessor:
             return image_processor_class.from_dict(config_dict, **kwargs)
         # Last try: we use the IMAGE_PROCESSOR_MAPPING.
         elif type(config) in IMAGE_PROCESSOR_MAPPING:
-            image_processor_class = IMAGE_PROCESSOR_MAPPING[type(config)]
-            return image_processor_class.from_dict(config_dict, **kwargs)
+            image_processor_tuple = IMAGE_PROCESSOR_MAPPING[type(config)]
+
+            image_processor_class_py, image_processor_class_fast = image_processor_tuple
+
+            if not use_fast and image_processor_class_fast is not None:
+                _warning_fast_image_processor_available(image_processor_class_fast)
+
+            if image_processor_class_fast and (use_fast or image_processor_class_py is None):
+                return image_processor_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+            else:
+                if image_processor_class_py is not None:
+                    return image_processor_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+                else:
+                    raise ValueError(
+                        "This image processor cannot be instantiated. Please make sure you have `Pillow` installed."
+                    )
 
         raise ValueError(
             f"Unrecognized image processor in {pretrained_model_name_or_path}. Should have a "
@@ -417,7 +487,13 @@ class AutoImageProcessor:
         )
 
     @staticmethod
-    def register(config_class, image_processor_class, exist_ok=False):
+    def register(
+        config_class,
+        image_processor_class=None,
+        slow_image_processor_class=None,
+        fast_image_processor_class=None,
+        exist_ok=False,
+    ):
         """
         Register a new image processor for this class.
 
@@ -426,4 +502,43 @@ class AutoImageProcessor:
                 The configuration corresponding to the model to register.
             image_processor_class ([`ImageProcessingMixin`]): The image processor to register.
         """
-        IMAGE_PROCESSOR_MAPPING.register(config_class, image_processor_class, exist_ok=exist_ok)
+        if image_processor_class is not None:
+            if slow_image_processor_class is not None:
+                raise ValueError("Cannot specify both image_processor_class and slow_image_processor_class")
+            warnings.warn(
+                "The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead",
+                FutureWarning,
+            )
+            slow_image_processor_class = image_processor_class
+
+        if slow_image_processor_class is None and fast_image_processor_class is None:
+            raise ValueError("You need to specify either slow_image_processor_class or fast_image_processor_class")
+        if slow_image_processor_class is not None and issubclass(slow_image_processor_class, BaseImageProcessorFast):
+            raise ValueError("You passed a fast image processor in as the `slow_image_processor_class`.")
+        if fast_image_processor_class is not None and issubclass(fast_image_processor_class, BaseImageProcessor):
+            raise ValueError("You passed a slow image processor in as the `fast_image_processor_class`.")
+
+        if (
+            slow_image_processor_class is not None
+            and fast_image_processor_class is not None
+            and issubclass(fast_image_processor_class, BaseImageProcessorFast)
+            and fast_image_processor_class.slow_image_processor_class != slow_image_processor_class
+        ):
+            raise ValueError(
+                "The fast processor class you are passing has a `slow_image_processor_class` attribute that is not "
+                "consistent with the slow processor class you passed (fast tokenizer has "
+                f"{fast_image_processor_class.slow_image_processor_class} and you passed {slow_image_processor_class}. Fix one of those "
+                "so they match!"
+            )
+
+        # Avoid resetting a set slow/fast image processor if we are passing just the other ones.
+        if config_class in IMAGE_PROCESSOR_MAPPING._extra_content:
+            existing_slow, existing_fast = IMAGE_PROCESSOR_MAPPING[config_class]
+            if slow_image_processor_class is None:
+                slow_image_processor_class = existing_slow
+            if fast_image_processor_class is None:
+                fast_image_processor_class = existing_fast
+
+        IMAGE_PROCESSOR_MAPPING.register(
+            config_class, (slow_image_processor_class, fast_image_processor_class), exist_ok=exist_ok
+        )
diff --git a/src/transformers/models/vit/__init__.py b/src/transformers/models/vit/__init__.py
index db41e881fa..3066331278 100644
--- a/src/transformers/models/vit/__init__.py
+++ b/src/transformers/models/vit/__init__.py
@@ -19,6 +19,7 @@ from ...utils import (
     is_flax_available,
     is_tf_available,
     is_torch_available,
+    is_torchvision_available,
     is_vision_available,
 )
 
@@ -34,6 +35,15 @@ else:
     _import_structure["feature_extraction_vit"] = ["ViTFeatureExtractor"]
     _import_structure["image_processing_vit"] = ["ViTImageProcessor"]
 
+
+try:
+    if not is_torchvision_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["image_processing_vit_fast"] = ["ViTImageProcessorFast"]
+
 try:
     if not is_torch_available():
         raise OptionalDependencyNotAvailable()
@@ -83,6 +93,14 @@ if TYPE_CHECKING:
         from .feature_extraction_vit import ViTFeatureExtractor
         from .image_processing_vit import ViTImageProcessor
 
+    try:
+        if not is_torchvision_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .image_processing_vit_fast import ViTImageProcessorFast
+
     try:
         if not is_torch_available():
             raise OptionalDependencyNotAvailable()
diff --git a/src/transformers/models/vit/image_processing_vit_fast.py b/src/transformers/models/vit/image_processing_vit_fast.py
new file mode 100644
index 0000000000..0911376165
--- /dev/null
+++ b/src/transformers/models/vit/image_processing_vit_fast.py
@@ -0,0 +1,289 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for ViT."""
+
+import functools
+from typing import Dict, List, Optional, Union
+
+from ...image_processing_base import BatchFeature
+from ...image_processing_utils import get_size_dict
+from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict
+from ...image_transforms import FusedRescaleNormalize, NumpyToTensor, Rescale
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    ImageType,
+    PILImageResampling,
+    get_image_type,
+    make_list_of_images,
+    pil_torch_interpolation_mapping,
+)
+from ...utils import TensorType, logging
+from ...utils.import_utils import is_torch_available, is_torchvision_available
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_torch_available():
+    import torch
+
+
+if is_torchvision_available():
+    from torchvision.transforms import Compose, Normalize, PILToTensor, Resize
+
+
+class ViTImageProcessorFast(BaseImageProcessorFast):
+    r"""
+    Constructs a ViT image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
+            size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
+        size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+            `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+            `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values"]
+    _transform_params = [
+        "do_resize",
+        "do_rescale",
+        "do_normalize",
+        "size",
+        "resample",
+        "rescale_factor",
+        "image_mean",
+        "image_std",
+        "image_type",
+    ]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[Dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 224, "width": 224}
+        size = get_size_dict(size)
+        self.do_resize = do_resize
+        self.do_rescale = do_rescale
+        self.do_normalize = do_normalize
+        self.size = size
+        self.resample = resample
+        self.rescale_factor = rescale_factor
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+        self._transform_settings = {}
+
+    def _build_transforms(
+        self,
+        do_resize: bool,
+        size: Dict[str, int],
+        resample: PILImageResampling,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Union[float, List[float]],
+        image_std: Union[float, List[float]],
+        image_type: ImageType,
+    ) -> "Compose":
+        """
+        Given the input settings build the image transforms using `torchvision.transforms.Compose`.
+        """
+        transforms = []
+
+        # All PIL and numpy values need to be converted to a torch tensor
+        # to keep cross compatibility with slow image processors
+        if image_type == ImageType.PIL:
+            transforms.append(PILToTensor())
+
+        elif image_type == ImageType.NUMPY:
+            transforms.append(NumpyToTensor())
+
+        if do_resize:
+            transforms.append(
+                Resize((size["height"], size["width"]), interpolation=pil_torch_interpolation_mapping[resample])
+            )
+
+        # We can combine rescale and normalize into a single operation for speed
+        if do_rescale and do_normalize:
+            transforms.append(FusedRescaleNormalize(image_mean, image_std, rescale_factor=rescale_factor))
+        elif do_rescale:
+            transforms.append(Rescale(rescale_factor=rescale_factor))
+        elif do_normalize:
+            transforms.append(Normalize(image_mean, image_std))
+
+        return Compose(transforms)
+
+    @functools.lru_cache(maxsize=1)
+    def _validate_input_arguments(
+        self,
+        return_tensors: Union[str, TensorType],
+        do_resize: bool,
+        size: Dict[str, int],
+        resample: PILImageResampling,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Union[float, List[float]],
+        image_std: Union[float, List[float]],
+        data_format: Union[str, ChannelDimension],
+        image_type: ImageType,
+    ):
+        if return_tensors != "pt":
+            raise ValueError("Only returning PyTorch tensors is currently supported.")
+
+        if data_format != ChannelDimension.FIRST:
+            raise ValueError("Only channel first data format is currently supported.")
+
+        if do_resize and None in (size, resample):
+            raise ValueError("Size and resample must be specified if do_resize is True.")
+
+        if do_rescale and rescale_factor is None:
+            raise ValueError("Rescale factor must be specified if do_rescale is True.")
+
+        if do_normalize and None in (image_mean, image_std):
+            raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = "pt",
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ):
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
+                resizing.
+            resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
+                `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
+                an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use if `do_normalize` is set to `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Only "pt" is supported
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. The following formats are currently supported:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        resample = resample if resample is not None else self.resample
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        size = size if size is not None else self.size
+        # Make hashable for cache
+        size = SizeDict(**size)
+        image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean
+        image_std = tuple(image_std) if isinstance(image_std, list) else image_std
+
+        images = make_list_of_images(images)
+        image_type = get_image_type(images[0])
+
+        if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
+            raise ValueError(f"Unsupported input image type {image_type}")
+
+        self._validate_input_arguments(
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            return_tensors=return_tensors,
+            data_format=data_format,
+            image_type=image_type,
+        )
+
+        transforms = self.get_transforms(
+            do_resize=do_resize,
+            do_rescale=do_rescale,
+            do_normalize=do_normalize,
+            size=size,
+            resample=resample,
+            rescale_factor=rescale_factor,
+            image_mean=image_mean,
+            image_std=image_std,
+            image_type=image_type,
+        )
+        transformed_images = [transforms(image) for image in images]
+
+        data = {"pixel_values": torch.vstack(transformed_images)}
+        return BatchFeature(data, tensor_type=return_tensors)
diff --git a/src/transformers/utils/dummy_torchvision_objects.py b/src/transformers/utils/dummy_torchvision_objects.py
new file mode 100644
index 0000000000..1d532aeea2
--- /dev/null
+++ b/src/transformers/utils/dummy_torchvision_objects.py
@@ -0,0 +1,16 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class BaseImageProcessorFast(metaclass=DummyObject):
+    _backends = ["torchvision"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torchvision"])
+
+
+class ViTImageProcessorFast(metaclass=DummyObject):
+    _backends = ["torchvision"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torchvision"])
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index aae31e9e4d..a27dc02444 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -9,6 +9,13 @@ class ImageProcessingMixin(metaclass=DummyObject):
         requires_backends(self, ["vision"])
 
 
+class BaseImageProcessor(metaclass=DummyObject):
+    _backends = ["vision"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["vision"])
+
+
 class ImageFeatureExtractionMixin(metaclass=DummyObject):
     _backends = ["vision"]
 
diff --git a/tests/models/auto/test_image_processing_auto.py b/tests/models/auto/test_image_processing_auto.py
index 0fb22b6c2b..b571e7a860 100644
--- a/tests/models/auto/test_image_processing_auto.py
+++ b/tests/models/auto/test_image_processing_auto.py
@@ -27,8 +27,10 @@ from transformers import (
     AutoImageProcessor,
     CLIPConfig,
     CLIPImageProcessor,
+    ViTImageProcessor,
+    ViTImageProcessorFast,
 )
-from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER
+from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_torchvision, require_vision
 
 
 sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
@@ -133,6 +135,23 @@ class AutoImageProcessorTest(unittest.TestCase):
         ):
             _ = AutoImageProcessor.from_pretrained("hf-internal-testing/config-no-model")
 
+    @require_vision
+    @require_torchvision
+    def test_use_fast_selection(self):
+        checkpoint = "hf-internal-testing/tiny-random-vit"
+
+        # Slow image processor is selected by default
+        image_processor = AutoImageProcessor.from_pretrained(checkpoint)
+        self.assertIsInstance(image_processor, ViTImageProcessor)
+
+        # Fast image processor is selected when use_fast=True
+        image_processor = AutoImageProcessor.from_pretrained(checkpoint, use_fast=True)
+        self.assertIsInstance(image_processor, ViTImageProcessorFast)
+
+        # Slow image processor is selected when use_fast=False
+        image_processor = AutoImageProcessor.from_pretrained(checkpoint, use_fast=False)
+        self.assertIsInstance(image_processor, ViTImageProcessor)
+
     def test_from_pretrained_dynamic_image_processor(self):
         # If remote code is not set, we will time out when asking whether to load the model.
         with self.assertRaises(ValueError):
diff --git a/tests/models/beit/test_image_processing_beit.py b/tests/models/beit/test_image_processing_beit.py
index d23e54db0d..e91517b3db 100644
--- a/tests/models/beit/test_image_processing_beit.py
+++ b/tests/models/beit/test_image_processing_beit.py
@@ -121,6 +121,7 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = BeitImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = BeitImageProcessingTester(self)
 
     @property
diff --git a/tests/models/blip/test_image_processing_blip.py b/tests/models/blip/test_image_processing_blip.py
index 1d7e7f12ee..905e1dad55 100644
--- a/tests/models/blip/test_image_processing_blip.py
+++ b/tests/models/blip/test_image_processing_blip.py
@@ -90,6 +90,7 @@ class BlipImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = BlipImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = BlipImageProcessingTester(self)
 
     @property
@@ -112,6 +113,7 @@ class BlipImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.Tes
     image_processing_class = BlipImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = BlipImageProcessingTester(self, num_channels=4)
         self.expected_encoded_image_num_channels = 3
 
diff --git a/tests/models/bridgetower/test_image_processing_bridgetower.py b/tests/models/bridgetower/test_image_processing_bridgetower.py
index f8837fdc96..1dc5419b77 100644
--- a/tests/models/bridgetower/test_image_processing_bridgetower.py
+++ b/tests/models/bridgetower/test_image_processing_bridgetower.py
@@ -136,6 +136,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
     image_processing_class = BridgeTowerImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = BridgeTowerImageProcessingTester(self)
 
     @property
diff --git a/tests/models/chinese_clip/test_image_processing_chinese_clip.py b/tests/models/chinese_clip/test_image_processing_chinese_clip.py
index 7eea00f885..94e41e8eaa 100644
--- a/tests/models/chinese_clip/test_image_processing_chinese_clip.py
+++ b/tests/models/chinese_clip/test_image_processing_chinese_clip.py
@@ -98,6 +98,7 @@ class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
     image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = ChineseCLIPImageProcessingTester(self, do_center_crop=True)
 
     @property
@@ -135,6 +136,7 @@ class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unitt
     image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = ChineseCLIPImageProcessingTester(self, num_channels=4, do_center_crop=True)
         self.expected_encoded_image_num_channels = 3
 
diff --git a/tests/models/clip/test_image_processing_clip.py b/tests/models/clip/test_image_processing_clip.py
index a35a23d8da..740399d13f 100644
--- a/tests/models/clip/test_image_processing_clip.py
+++ b/tests/models/clip/test_image_processing_clip.py
@@ -94,6 +94,7 @@ class CLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = CLIPImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = CLIPImageProcessingTester(self)
 
     @property
diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py
index 7bbee7e831..171ec2d44f 100644
--- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py
+++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py
@@ -131,6 +131,7 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
     image_processing_class = ConditionalDetrImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = ConditionalDetrImageProcessingTester(self)
 
     @property
diff --git a/tests/models/convnext/test_image_processing_convnext.py b/tests/models/convnext/test_image_processing_convnext.py
index 0c33174180..d2eaae4534 100644
--- a/tests/models/convnext/test_image_processing_convnext.py
+++ b/tests/models/convnext/test_image_processing_convnext.py
@@ -87,6 +87,7 @@ class ConvNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = ConvNextImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = ConvNextImageProcessingTester(self)
 
     @property
diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
index 59ba5b59e3..51fbfc33f8 100644
--- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py
+++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
@@ -131,6 +131,7 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
     image_processing_class = DeformableDetrImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = DeformableDetrImageProcessingTester(self)
 
     @property
diff --git a/tests/models/deit/test_image_processing_deit.py b/tests/models/deit/test_image_processing_deit.py
index 21dc3d9e95..462ad56d6b 100644
--- a/tests/models/deit/test_image_processing_deit.py
+++ b/tests/models/deit/test_image_processing_deit.py
@@ -93,6 +93,7 @@ class DeiTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     test_cast_dtype = True
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = DeiTImageProcessingTester(self)
 
     @property
diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py
index 7f9f18b9d4..fc6d565127 100644
--- a/tests/models/detr/test_image_processing_detr.py
+++ b/tests/models/detr/test_image_processing_detr.py
@@ -130,6 +130,7 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
     image_processing_class = DetrImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = DetrImageProcessingTester(self)
 
     @property
diff --git a/tests/models/donut/test_image_processing_donut.py b/tests/models/donut/test_image_processing_donut.py
index c1a2bd3b26..9d96eb8ede 100644
--- a/tests/models/donut/test_image_processing_donut.py
+++ b/tests/models/donut/test_image_processing_donut.py
@@ -99,6 +99,7 @@ class DonutImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = DonutImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = DonutImageProcessingTester(self)
 
     @property
diff --git a/tests/models/dpt/test_image_processing_dpt.py b/tests/models/dpt/test_image_processing_dpt.py
index 2cc72274c4..aa1b954a08 100644
--- a/tests/models/dpt/test_image_processing_dpt.py
+++ b/tests/models/dpt/test_image_processing_dpt.py
@@ -86,6 +86,7 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = DPTImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = DPTImageProcessingTester(self)
 
     @property
diff --git a/tests/models/efficientnet/test_image_processing_efficientnet.py b/tests/models/efficientnet/test_image_processing_efficientnet.py
index fd754d8eb9..28b701c5c9 100644
--- a/tests/models/efficientnet/test_image_processing_efficientnet.py
+++ b/tests/models/efficientnet/test_image_processing_efficientnet.py
@@ -86,6 +86,7 @@ class EfficientNetImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase
     image_processing_class = EfficientNetImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = EfficientNetImageProcessorTester(self)
 
     @property
diff --git a/tests/models/flava/test_image_processing_flava.py b/tests/models/flava/test_image_processing_flava.py
index d89a1a6f6b..04457e51ac 100644
--- a/tests/models/flava/test_image_processing_flava.py
+++ b/tests/models/flava/test_image_processing_flava.py
@@ -175,6 +175,7 @@ class FlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     maxDiff = None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = FlavaImageProcessingTester(self)
 
     @property
diff --git a/tests/models/glpn/test_image_processing_glpn.py b/tests/models/glpn/test_image_processing_glpn.py
index f9cadb3313..abffb31a66 100644
--- a/tests/models/glpn/test_image_processing_glpn.py
+++ b/tests/models/glpn/test_image_processing_glpn.py
@@ -93,6 +93,7 @@ class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = GLPNImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = GLPNImageProcessingTester(self)
 
     @property
diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py
index 6d20a01981..68618fb256 100644
--- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py
+++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py
@@ -146,6 +146,7 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
     image_processing_class = GroundingDinoImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = GroundingDinoImageProcessingTester(self)
 
     @property
diff --git a/tests/models/idefics/test_image_processing_idefics.py b/tests/models/idefics/test_image_processing_idefics.py
index de42a421cd..0273480333 100644
--- a/tests/models/idefics/test_image_processing_idefics.py
+++ b/tests/models/idefics/test_image_processing_idefics.py
@@ -127,6 +127,7 @@ class IdeficsImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = IdeficsImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = IdeficsImageProcessingTester(self)
 
     @property
diff --git a/tests/models/idefics2/test_image_processing_idefics2.py b/tests/models/idefics2/test_image_processing_idefics2.py
index 4b3af1f632..2e0d36e75c 100644
--- a/tests/models/idefics2/test_image_processing_idefics2.py
+++ b/tests/models/idefics2/test_image_processing_idefics2.py
@@ -185,6 +185,7 @@ class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = Idefics2ImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = Idefics2ImageProcessingTester(self)
 
     @property
diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py
index 4596d742a2..a9dbc636ef 100644
--- a/tests/models/imagegpt/test_image_processing_imagegpt.py
+++ b/tests/models/imagegpt/test_image_processing_imagegpt.py
@@ -22,7 +22,8 @@ import unittest
 import numpy as np
 from datasets import load_dataset
 
-from transformers.testing_utils import require_torch, require_vision, slow
+from transformers import AutoImageProcessor
+from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
@@ -96,6 +97,7 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = ImageGPTImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = ImageGPTImageProcessingTester(self)
 
     @property
@@ -141,18 +143,38 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
                 self.assertEqual(image_processor_first[key], value)
 
     def test_image_processor_from_and_save_pretrained(self):
-        image_processor_first = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            image_processor_first = self.image_processing_class(**self.image_processor_dict)
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            image_processor_first.save_pretrained(tmpdirname)
-            image_processor_second = self.image_processing_class.from_pretrained(tmpdirname).to_dict()
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                image_processor_first.save_pretrained(tmpdirname)
+                image_processor_second = self.image_processing_class.from_pretrained(tmpdirname).to_dict()
 
-        image_processor_first = image_processor_first.to_dict()
-        for key, value in image_processor_first.items():
-            if key == "clusters":
-                self.assertTrue(np.array_equal(value, image_processor_second[key]))
-            else:
-                self.assertEqual(image_processor_first[key], value)
+            image_processor_first = image_processor_first.to_dict()
+            for key, value in image_processor_first.items():
+                if key == "clusters":
+                    self.assertTrue(np.array_equal(value, image_processor_second[key]))
+                else:
+                    self.assertEqual(image_processor_first[key], value)
+
+    def test_image_processor_save_load_with_autoimageprocessor(self):
+        for image_processing_class in self.image_processor_list:
+            image_processor_first = image_processing_class(**self.image_processor_dict)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
+                check_json_file_has_correct_format(saved_file)
+
+                image_processor_second = AutoImageProcessor.from_pretrained(tmpdirname)
+
+            image_processor_first = image_processor_first.to_dict()
+            image_processor_second = image_processor_second.to_dict()
+
+            for key, value in image_processor_first.items():
+                if key == "clusters":
+                    self.assertTrue(np.array_equal(value, image_processor_second[key]))
+                else:
+                    self.assertEqual(image_processor_first[key], value)
 
     @unittest.skip("ImageGPT requires clusters at initialization")
     def test_init_without_params(self):
diff --git a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py
index eebb7420be..4413c8d756 100644
--- a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py
@@ -76,6 +76,7 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
     image_processing_class = LayoutLMv2ImageProcessor if is_pytesseract_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = LayoutLMv2ImageProcessingTester(self)
 
     @property
diff --git a/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py b/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py
index 8d4b64c2cc..a12fb6af0d 100644
--- a/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py
@@ -76,6 +76,7 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
     image_processing_class = LayoutLMv3ImageProcessor if is_pytesseract_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = LayoutLMv3ImageProcessingTester(self)
 
     @property
diff --git a/tests/models/levit/test_image_processing_levit.py b/tests/models/levit/test_image_processing_levit.py
index 756993c6b6..8827076290 100644
--- a/tests/models/levit/test_image_processing_levit.py
+++ b/tests/models/levit/test_image_processing_levit.py
@@ -91,6 +91,7 @@ class LevitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = LevitImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = LevitImageProcessingTester(self)
 
     @property
diff --git a/tests/models/llava_next/test_image_processor_llava_next.py b/tests/models/llava_next/test_image_processor_llava_next.py
index 8b1f98bbca..ff5c9e9708 100644
--- a/tests/models/llava_next/test_image_processor_llava_next.py
+++ b/tests/models/llava_next/test_image_processor_llava_next.py
@@ -105,6 +105,7 @@ class LlavaNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
 
     # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaNext
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = LlavaNextImageProcessingTester(self)
 
     @property
diff --git a/tests/models/mask2former/test_image_processing_mask2former.py b/tests/models/mask2former/test_image_processing_mask2former.py
index 9e7045c480..ae0fff8906 100644
--- a/tests/models/mask2former/test_image_processing_mask2former.py
+++ b/tests/models/mask2former/test_image_processing_mask2former.py
@@ -149,6 +149,7 @@ class Mask2FormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
     image_processing_class = Mask2FormerImageProcessor if (is_vision_available() and is_torch_available()) else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = Mask2FormerImageProcessingTester(self)
 
     @property
diff --git a/tests/models/maskformer/test_image_processing_maskformer.py b/tests/models/maskformer/test_image_processing_maskformer.py
index fca6576595..5d30431f1f 100644
--- a/tests/models/maskformer/test_image_processing_maskformer.py
+++ b/tests/models/maskformer/test_image_processing_maskformer.py
@@ -149,6 +149,7 @@ class MaskFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
     image_processing_class = MaskFormerImageProcessor if (is_vision_available() and is_torch_available()) else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = MaskFormerImageProcessingTester(self)
 
     @property
diff --git a/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py b/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py
index ce0ecba34c..c9d32b0bab 100644
--- a/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py
+++ b/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py
@@ -82,6 +82,7 @@ class MobileNetV1ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
     image_processing_class = MobileNetV1ImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = MobileNetV1ImageProcessingTester(self)
 
     @property
diff --git a/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py b/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py
index 4c94be4721..e9cdf4a435 100644
--- a/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py
+++ b/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py
@@ -82,6 +82,7 @@ class MobileNetV2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
     image_processing_class = MobileNetV2ImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = MobileNetV2ImageProcessingTester(self)
 
     @property
diff --git a/tests/models/mobilevit/test_image_processing_mobilevit.py b/tests/models/mobilevit/test_image_processing_mobilevit.py
index 92e1a55947..9895befc8f 100644
--- a/tests/models/mobilevit/test_image_processing_mobilevit.py
+++ b/tests/models/mobilevit/test_image_processing_mobilevit.py
@@ -112,6 +112,7 @@ class MobileViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = MobileViTImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = MobileViTImageProcessingTester(self)
 
     @property
diff --git a/tests/models/nougat/test_image_processing_nougat.py b/tests/models/nougat/test_image_processing_nougat.py
index fc61ecbc19..5ab2901d31 100644
--- a/tests/models/nougat/test_image_processing_nougat.py
+++ b/tests/models/nougat/test_image_processing_nougat.py
@@ -111,6 +111,7 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = NougatImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = NougatImageProcessingTester(self)
 
     @property
diff --git a/tests/models/oneformer/test_image_processing_oneformer.py b/tests/models/oneformer/test_image_processing_oneformer.py
index 245af190c9..e60cc31b30 100644
--- a/tests/models/oneformer/test_image_processing_oneformer.py
+++ b/tests/models/oneformer/test_image_processing_oneformer.py
@@ -159,6 +159,7 @@ class OneFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = image_processing_class
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = OneFormerImageProcessorTester(self)
 
     @property
diff --git a/tests/models/owlv2/test_image_processor_owlv2.py b/tests/models/owlv2/test_image_processor_owlv2.py
index 87b96d0654..51814b6dd8 100644
--- a/tests/models/owlv2/test_image_processor_owlv2.py
+++ b/tests/models/owlv2/test_image_processor_owlv2.py
@@ -90,6 +90,7 @@ class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = Owlv2ImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = Owlv2ImageProcessingTester(self)
 
     @property
diff --git a/tests/models/owlvit/test_image_processing_owlvit.py b/tests/models/owlvit/test_image_processing_owlvit.py
index f4897c051e..4442b1a65a 100644
--- a/tests/models/owlvit/test_image_processing_owlvit.py
+++ b/tests/models/owlvit/test_image_processing_owlvit.py
@@ -92,6 +92,7 @@ class OwlViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = OwlViTImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = OwlViTImageProcessingTester(self)
 
     @property
diff --git a/tests/models/pix2struct/test_image_processing_pix2struct.py b/tests/models/pix2struct/test_image_processing_pix2struct.py
index f0b94c4cf5..09e1abd806 100644
--- a/tests/models/pix2struct/test_image_processing_pix2struct.py
+++ b/tests/models/pix2struct/test_image_processing_pix2struct.py
@@ -87,6 +87,7 @@ class Pix2StructImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
     image_processing_class = Pix2StructImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = Pix2StructImageProcessingTester(self)
 
     @property
@@ -288,6 +289,7 @@ class Pix2StructImageProcessingTestFourChannels(ImageProcessingTestMixin, unitte
     image_processing_class = Pix2StructImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = Pix2StructImageProcessingTester(self, num_channels=4)
         self.expected_encoded_image_num_channels = 3
 
diff --git a/tests/models/poolformer/test_image_processing_poolformer.py b/tests/models/poolformer/test_image_processing_poolformer.py
index 017a511c40..af4c2bcbb5 100644
--- a/tests/models/poolformer/test_image_processing_poolformer.py
+++ b/tests/models/poolformer/test_image_processing_poolformer.py
@@ -88,6 +88,7 @@ class PoolFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
     image_processing_class = PoolFormerImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = PoolFormerImageProcessingTester(self)
 
     @property
diff --git a/tests/models/pvt/test_image_processing_pvt.py b/tests/models/pvt/test_image_processing_pvt.py
index d6b11313d8..d24421fc74 100644
--- a/tests/models/pvt/test_image_processing_pvt.py
+++ b/tests/models/pvt/test_image_processing_pvt.py
@@ -84,6 +84,7 @@ class PvtImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = PvtImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = PvtImageProcessingTester(self)
 
     @property
diff --git a/tests/models/segformer/test_image_processing_segformer.py b/tests/models/segformer/test_image_processing_segformer.py
index bee6a4a24b..988843b710 100644
--- a/tests/models/segformer/test_image_processing_segformer.py
+++ b/tests/models/segformer/test_image_processing_segformer.py
@@ -112,6 +112,7 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = SegformerImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = SegformerImageProcessingTester(self)
 
     @property
diff --git a/tests/models/seggpt/test_image_processing_seggpt.py b/tests/models/seggpt/test_image_processing_seggpt.py
index 04cefb70d0..f79b7ea443 100644
--- a/tests/models/seggpt/test_image_processing_seggpt.py
+++ b/tests/models/seggpt/test_image_processing_seggpt.py
@@ -114,6 +114,7 @@ class SegGptImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = SegGptImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = SegGptImageProcessingTester(self)
 
     @property
diff --git a/tests/models/siglip/test_image_processor_siglip.py b/tests/models/siglip/test_image_processor_siglip.py
index 5f43d6f08a..7dbd05070c 100644
--- a/tests/models/siglip/test_image_processor_siglip.py
+++ b/tests/models/siglip/test_image_processor_siglip.py
@@ -91,6 +91,7 @@ class SiglipImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = SiglipImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = SiglipImageProcessingTester(self)
 
     @property
diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index 19406bc91a..90bbf82d1e 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -77,6 +77,7 @@ class SuperPointImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
     image_processing_class = SuperPointImageProcessor if is_vision_available() else None
 
     def setUp(self) -> None:
+        super().setUp()
         self.image_processor_tester = SuperPointImageProcessingTester(self)
 
     @property
diff --git a/tests/models/swin2sr/test_image_processing_swin2sr.py b/tests/models/swin2sr/test_image_processing_swin2sr.py
index 719ac79d09..732a7e9541 100644
--- a/tests/models/swin2sr/test_image_processing_swin2sr.py
+++ b/tests/models/swin2sr/test_image_processing_swin2sr.py
@@ -98,6 +98,7 @@ class Swin2SRImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = Swin2SRImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = Swin2SRImageProcessingTester(self)
 
     @property
diff --git a/tests/models/tvp/test_image_processing_tvp.py b/tests/models/tvp/test_image_processing_tvp.py
index 1c9a84beb8..7de45d4bee 100644
--- a/tests/models/tvp/test_image_processing_tvp.py
+++ b/tests/models/tvp/test_image_processing_tvp.py
@@ -127,6 +127,7 @@ class TvpImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = TvpImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = TvpImageProcessingTester(self)
 
     @property
diff --git a/tests/models/video_llava/test_image_processing_video_llava.py b/tests/models/video_llava/test_image_processing_video_llava.py
index 4b69022bae..808001d281 100644
--- a/tests/models/video_llava/test_image_processing_video_llava.py
+++ b/tests/models/video_llava/test_image_processing_video_llava.py
@@ -128,6 +128,7 @@ class VideoLlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
 
     # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->VideoLlava
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = VideoLlavaImageProcessingTester(self)
 
     @property
diff --git a/tests/models/videomae/test_image_processing_videomae.py b/tests/models/videomae/test_image_processing_videomae.py
index 4a6f0b93c4..319e39fcc2 100644
--- a/tests/models/videomae/test_image_processing_videomae.py
+++ b/tests/models/videomae/test_image_processing_videomae.py
@@ -99,6 +99,7 @@ class VideoMAEImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = VideoMAEImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = VideoMAEImageProcessingTester(self)
 
     @property
diff --git a/tests/models/vilt/test_image_processing_vilt.py b/tests/models/vilt/test_image_processing_vilt.py
index 607a8b929d..f68b2d2628 100644
--- a/tests/models/vilt/test_image_processing_vilt.py
+++ b/tests/models/vilt/test_image_processing_vilt.py
@@ -130,6 +130,7 @@ class ViltImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = ViltImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = ViltImageProcessingTester(self)
 
     @property
diff --git a/tests/models/vit/test_image_processing_vit.py b/tests/models/vit/test_image_processing_vit.py
index c1c22c0a80..1c376f55aa 100644
--- a/tests/models/vit/test_image_processing_vit.py
+++ b/tests/models/vit/test_image_processing_vit.py
@@ -84,6 +84,7 @@ class ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = ViTImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = ViTImageProcessingTester(self)
 
     @property
@@ -91,16 +92,18 @@ class ViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
         return self.image_processor_tester.prepare_image_processor_dict()
 
     def test_image_processor_properties(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        self.assertTrue(hasattr(image_processing, "image_mean"))
-        self.assertTrue(hasattr(image_processing, "image_std"))
-        self.assertTrue(hasattr(image_processing, "do_normalize"))
-        self.assertTrue(hasattr(image_processing, "do_resize"))
-        self.assertTrue(hasattr(image_processing, "size"))
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
+            self.assertTrue(hasattr(image_processing, "image_mean"))
+            self.assertTrue(hasattr(image_processing, "image_std"))
+            self.assertTrue(hasattr(image_processing, "do_normalize"))
+            self.assertTrue(hasattr(image_processing, "do_resize"))
+            self.assertTrue(hasattr(image_processing, "size"))
 
     def test_image_processor_from_dict_with_kwargs(self):
-        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
-        self.assertEqual(image_processor.size, {"height": 18, "width": 18})
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class.from_dict(self.image_processor_dict)
+            self.assertEqual(image_processor.size, {"height": 18, "width": 18})
 
-        image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
-        self.assertEqual(image_processor.size, {"height": 42, "width": 42})
+            image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
+            self.assertEqual(image_processor.size, {"height": 42, "width": 42})
diff --git a/tests/models/vitmatte/test_image_processing_vitmatte.py b/tests/models/vitmatte/test_image_processing_vitmatte.py
index e86cfde1e5..8aebee3735 100644
--- a/tests/models/vitmatte/test_image_processing_vitmatte.py
+++ b/tests/models/vitmatte/test_image_processing_vitmatte.py
@@ -94,6 +94,7 @@ class VitMatteImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = VitMatteImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = VitMatteImageProcessingTester(self)
 
     @property
diff --git a/tests/models/vivit/test_image_processing_vivit.py b/tests/models/vivit/test_image_processing_vivit.py
index dad120ef81..0e8301f667 100644
--- a/tests/models/vivit/test_image_processing_vivit.py
+++ b/tests/models/vivit/test_image_processing_vivit.py
@@ -99,6 +99,7 @@ class VivitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = VivitImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = VivitImageProcessingTester(self)
 
     @property
diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py
index f04015ac0c..a94cd8b883 100644
--- a/tests/models/yolos/test_image_processing_yolos.py
+++ b/tests/models/yolos/test_image_processing_yolos.py
@@ -143,6 +143,7 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
     image_processing_class = YolosImageProcessor if is_vision_available() else None
 
     def setUp(self):
+        super().setUp()
         self.image_processor_tester = YolosImageProcessingTester(self)
 
     @property
diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py
index 90c1a4e7e1..e9b9467f58 100644
--- a/tests/test_image_processing_common.py
+++ b/tests/test_image_processing_common.py
@@ -19,7 +19,9 @@ import os
 import pathlib
 import tempfile
 
-from transformers import BatchFeature
+import requests
+
+from transformers import AutoImageProcessor, BatchFeature
 from transformers.image_utils import AnnotationFormat, AnnotionFormat
 from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
@@ -129,176 +131,263 @@ def prepare_video_inputs(
 
 class ImageProcessingTestMixin:
     test_cast_dtype = None
+    image_processing_class = None
+    fast_image_processing_class = None
+    image_processors_list = None
+    test_slow_image_processor = True
+    test_fast_image_processor = True
+
+    def setUp(self):
+        image_processor_list = []
+
+        if self.test_slow_image_processor and self.image_processing_class:
+            image_processor_list.append(self.image_processing_class)
+
+        if self.test_fast_image_processor and self.fast_image_processing_class:
+            image_processor_list.append(self.fast_image_processing_class)
+
+        self.image_processor_list = image_processor_list
+
+    @require_vision
+    @require_torch
+    def test_slow_fast_equivalence(self):
+        dummy_image = Image.open(
+            requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
+        )
+
+        if not self.test_slow_image_processor or not self.test_fast_image_processor:
+            self.skipTest("Skipping slow/fast equivalence test")
+
+        if self.image_processing_class is None or self.fast_image_processing_class is None:
+            self.skipTest("Skipping slow/fast equivalence test as one of the image processors is not defined")
+
+        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
+        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
+
+        encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
+        encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
+
+        self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-3))
+
+    @require_vision
+    @require_torch
+    def test_fast_is_faster_than_slow(self):
+        import time
+
+        def measure_time(self, image_processor, dummy_image):
+            start = time.time()
+            _ = image_processor(dummy_image, return_tensors="pt")
+            return time.time() - start
+
+        dummy_image = Image.open(
+            requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
+        )
+
+        if not self.test_slow_image_processor or not self.test_fast_image_processor:
+            self.skipTest("Skipping speed test")
+
+        if self.image_processing_class is None or self.fast_image_processing_class is None:
+            self.skipTest("Skipping speed test as one of the image processors is not defined")
+
+        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
+        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
+
+        slow_time = self.measure_time(image_processor_slow, dummy_image)
+        fast_time = self.measure_time(image_processor_fast, dummy_image)
+
+        self.assertLessEqual(fast_time, slow_time)
 
     def test_image_processor_to_json_string(self):
-        image_processor = self.image_processing_class(**self.image_processor_dict)
-        obj = json.loads(image_processor.to_json_string())
-        for key, value in self.image_processor_dict.items():
-            self.assertEqual(obj[key], value)
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class(**self.image_processor_dict)
+            obj = json.loads(image_processor.to_json_string())
+            for key, value in self.image_processor_dict.items():
+                self.assertEqual(obj[key], value)
 
     def test_image_processor_to_json_file(self):
-        image_processor_first = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            image_processor_first = image_processing_class(**self.image_processor_dict)
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            json_file_path = os.path.join(tmpdirname, "image_processor.json")
-            image_processor_first.to_json_file(json_file_path)
-            image_processor_second = self.image_processing_class.from_json_file(json_file_path)
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                json_file_path = os.path.join(tmpdirname, "image_processor.json")
+                image_processor_first.to_json_file(json_file_path)
+                image_processor_second = image_processing_class.from_json_file(json_file_path)
 
-        self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
+            self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
 
     def test_image_processor_from_and_save_pretrained(self):
-        image_processor_first = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            image_processor_first = image_processing_class(**self.image_processor_dict)
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
-            check_json_file_has_correct_format(saved_file)
-            image_processor_second = self.image_processing_class.from_pretrained(tmpdirname)
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
+                check_json_file_has_correct_format(saved_file)
+                image_processor_second = image_processing_class.from_pretrained(tmpdirname)
 
-        self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
+            self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
+
+    def test_image_processor_save_load_with_autoimageprocessor(self):
+        for image_processing_class in self.image_processor_list:
+            image_processor_first = image_processing_class(**self.image_processor_dict)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                saved_file = image_processor_first.save_pretrained(tmpdirname)[0]
+                check_json_file_has_correct_format(saved_file)
+
+                image_processor_second = AutoImageProcessor.from_pretrained(tmpdirname)
+
+            self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
 
     def test_init_without_params(self):
-        image_processor = self.image_processing_class()
-        self.assertIsNotNone(image_processor)
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class()
+            self.assertIsNotNone(image_processor)
 
     @require_torch
     @require_vision
     def test_cast_dtype_device(self):
-        if self.test_cast_dtype is not None:
-            # Initialize image_processor
-            image_processor = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            if self.test_cast_dtype is not None:
+                # Initialize image_processor
+                image_processor = image_processing_class(**self.image_processor_dict)
 
+                # create random PyTorch tensors
+                image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+
+                encoding = image_processor(image_inputs, return_tensors="pt")
+                # for layoutLM compatiblity
+                self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
+                self.assertEqual(encoding.pixel_values.dtype, torch.float32)
+
+                encoding = image_processor(image_inputs, return_tensors="pt").to(torch.float16)
+                self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
+                self.assertEqual(encoding.pixel_values.dtype, torch.float16)
+
+                encoding = image_processor(image_inputs, return_tensors="pt").to("cpu", torch.bfloat16)
+                self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
+                self.assertEqual(encoding.pixel_values.dtype, torch.bfloat16)
+
+                with self.assertRaises(TypeError):
+                    _ = image_processor(image_inputs, return_tensors="pt").to(torch.bfloat16, "cpu")
+
+                # Try with text + image feature
+                encoding = image_processor(image_inputs, return_tensors="pt")
+                encoding.update({"input_ids": torch.LongTensor([[1, 2, 3], [4, 5, 6]])})
+                encoding = encoding.to(torch.float16)
+
+                self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
+                self.assertEqual(encoding.pixel_values.dtype, torch.float16)
+                self.assertEqual(encoding.input_ids.dtype, torch.long)
+
+    def test_call_pil(self):
+        for image_processing_class in self.image_processor_list:
+            # Initialize image_processing
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # create random PIL images
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
+            for image in image_inputs:
+                self.assertIsInstance(image, Image.Image)
+
+            # Test not batched input
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+
+            # Test batched
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            self.assertEqual(
+                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
+            )
+
+    def test_call_numpy(self):
+        for image_processing_class in self.image_processor_list:
+            # Initialize image_processing
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # create random numpy tensors
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+            for image in image_inputs:
+                self.assertIsInstance(image, np.ndarray)
+
+            # Test not batched input
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+
+            # Test batched
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            self.assertEqual(
+                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
+            )
+
+    def test_call_pytorch(self):
+        for image_processing_class in self.image_processor_list:
+            # Initialize image_processing
+            image_processing = image_processing_class(**self.image_processor_dict)
             # create random PyTorch tensors
             image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
 
-            encoding = image_processor(image_inputs, return_tensors="pt")
-            # for layoutLM compatiblity
-            self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
-            self.assertEqual(encoding.pixel_values.dtype, torch.float32)
+            for image in image_inputs:
+                self.assertIsInstance(image, torch.Tensor)
 
-            encoding = image_processor(image_inputs, return_tensors="pt").to(torch.float16)
-            self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
-            self.assertEqual(encoding.pixel_values.dtype, torch.float16)
+            # Test not batched input
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
 
-            encoding = image_processor(image_inputs, return_tensors="pt").to("cpu", torch.bfloat16)
-            self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
-            self.assertEqual(encoding.pixel_values.dtype, torch.bfloat16)
-
-            with self.assertRaises(TypeError):
-                _ = image_processor(image_inputs, return_tensors="pt").to(torch.bfloat16, "cpu")
-
-            # Try with text + image feature
-            encoding = image_processor(image_inputs, return_tensors="pt")
-            encoding.update({"input_ids": torch.LongTensor([[1, 2, 3], [4, 5, 6]])})
-            encoding = encoding.to(torch.float16)
-
-            self.assertEqual(encoding.pixel_values.device, torch.device("cpu"))
-            self.assertEqual(encoding.pixel_values.dtype, torch.float16)
-            self.assertEqual(encoding.input_ids.dtype, torch.long)
-
-    def test_call_pil(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random PIL images
-        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
-        for image in image_inputs:
-            self.assertIsInstance(image, Image.Image)
-
-        # Test not batched input
-        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
-        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
-        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
-
-        # Test batched
-        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
-        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
-        self.assertEqual(
-            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
-        )
-
-    def test_call_numpy(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
-        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
-        for image in image_inputs:
-            self.assertIsInstance(image, np.ndarray)
-
-        # Test not batched input
-        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
-        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
-        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
-
-        # Test batched
-        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
-        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
-        self.assertEqual(
-            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
-        )
-
-    def test_call_pytorch(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random PyTorch tensors
-        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
-
-        for image in image_inputs:
-            self.assertIsInstance(image, torch.Tensor)
-
-        # Test not batched input
-        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
-        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
-        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
-
-        # Test batched
-        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
-        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
-        self.assertEqual(
-            tuple(encoded_images.shape),
-            (self.image_processor_tester.batch_size, *expected_output_image_shape),
-        )
+            # Test batched
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
+            self.assertEqual(
+                tuple(encoded_images.shape),
+                (self.image_processor_tester.batch_size, *expected_output_image_shape),
+            )
 
     def test_call_numpy_4_channels(self):
-        # Test that can process images which have an arbitrary number of channels
-        # Initialize image_processing
-        image_processor = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            # Test that can process images which have an arbitrary number of channels
+            # Initialize image_processing
+            image_processor = image_processing_class(**self.image_processor_dict)
 
-        # create random numpy tensors
-        self.image_processor_tester.num_channels = 4
-        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+            # create random numpy tensors
+            self.image_processor_tester.num_channels = 4
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
 
-        # Test not batched input
-        encoded_images = image_processor(
-            image_inputs[0],
-            return_tensors="pt",
-            input_data_format="channels_first",
-            image_mean=0,
-            image_std=1,
-        ).pixel_values
-        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
-        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+            # Test not batched input
+            encoded_images = image_processor(
+                image_inputs[0],
+                return_tensors="pt",
+                input_data_format="channels_first",
+                image_mean=0,
+                image_std=1,
+            ).pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
 
-        # Test batched
-        encoded_images = image_processor(
-            image_inputs,
-            return_tensors="pt",
-            input_data_format="channels_first",
-            image_mean=0,
-            image_std=1,
-        ).pixel_values
-        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
-        self.assertEqual(
-            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
-        )
+            # Test batched
+            encoded_images = image_processor(
+                image_inputs,
+                return_tensors="pt",
+                input_data_format="channels_first",
+                image_mean=0,
+                image_std=1,
+            ).pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            self.assertEqual(
+                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
+            )
 
     def test_image_processor_preprocess_arguments(self):
-        image_processor = self.image_processing_class(**self.image_processor_dict)
-        if hasattr(image_processor, "_valid_processor_keys") and hasattr(image_processor, "preprocess"):
-            preprocess_parameter_names = inspect.getfullargspec(image_processor.preprocess).args
-            preprocess_parameter_names.remove("self")
-            preprocess_parameter_names.sort()
-            valid_processor_keys = image_processor._valid_processor_keys
-            valid_processor_keys.sort()
-            self.assertEqual(preprocess_parameter_names, valid_processor_keys)
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class(**self.image_processor_dict)
+            if hasattr(image_processor, "_valid_processor_keys") and hasattr(image_processor, "preprocess"):
+                preprocess_parameter_names = inspect.getfullargspec(image_processor.preprocess).args
+                preprocess_parameter_names.remove("self")
+                preprocess_parameter_names.sort()
+                valid_processor_keys = image_processor._valid_processor_keys
+                valid_processor_keys.sort()
+                self.assertEqual(preprocess_parameter_names, valid_processor_keys)
 
 
 class AnnotationFormatTestMixin: