Add fast image processor Janus, Deepseek VL, Deepseek VL hybrid (#39739)

* add fast image processor Janus, deepseek_vl, deepseek_vl_hybrid * fix after review
2025-08-01 12:20:08 -04:00
parent 88ead3f518
commit 7b4d9843ba
19 changed files with 1268 additions and 97 deletions
--- a/docs/source/en/model_doc/deepseek_vl.md
+++ b/docs/source/en/model_doc/deepseek_vl.md
@@ -209,6 +209,10 @@ model = DeepseekVLForConditionalGeneration.from_pretrained(

 [[autodoc]] DeepseekVLImageProcessor

+## DeepseekVLImageProcessorFast
+
+[[autodoc]] DeepseekVLImageProcessorFast
+
 ## DeepseekVLModel

 [[autodoc]] DeepseekVLModel
--- a/docs/source/en/model_doc/deepseek_vl_hybrid.md
+++ b/docs/source/en/model_doc/deepseek_vl_hybrid.md
@@ -208,6 +208,10 @@ model = DeepseekVLHybridForConditionalGeneration.from_pretrained(

 [[autodoc]] DeepseekVLHybridImageProcessor

+## DeepseekVLHybridImageProcessorFast
+
+[[autodoc]] DeepseekVLHybridImageProcessorFast
+
 ## DeepseekVLHybridModel

 [[autodoc]] DeepseekVLHybridModel
--- a/docs/source/en/model_doc/janus.md
+++ b/docs/source/en/model_doc/janus.md
@@ -209,6 +209,10 @@ for i, image in enumerate(images['pixel_values']):

 [[autodoc]] JanusImageProcessor

+## JanusImageProcessorFast
+
+[[autodoc]] JanusImageProcessorFast
+
 ## JanusVisionModel

 [[autodoc]] JanusVisionModel
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -78,8 +78,8 @@ else:
            ("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
            ("cvt", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
            ("data2vec-vision", ("BeitImageProcessor", "BeitImageProcessorFast")),
-            ("deepseek_vl", ("DeepseekVLImageProcessor")),
-            ("deepseek_vl_hybrid", ("DeepseekVLHybridImageProcessor")),
+            ("deepseek_vl", ("DeepseekVLImageProcessor", "DeepseekVLImageProcessorFast")),
+            ("deepseek_vl_hybrid", ("DeepseekVLHybridImageProcessor", "DeepseekVLHybridImageProcessorFast")),
            ("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")),
            ("deit", ("DeiTImageProcessor", "DeiTImageProcessorFast")),
            ("depth_anything", ("DPTImageProcessor", "DPTImageProcessorFast")),
@@ -113,7 +113,7 @@ else:
            ("imagegpt", ("ImageGPTImageProcessor",)),
            ("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")),
            ("instructblipvideo", ("InstructBlipVideoImageProcessor",)),
-            ("janus", ("JanusImageProcessor")),
+            ("janus", ("JanusImageProcessor", "JanusImageProcessorFast")),
            ("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")),
            ("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
--- a/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py
+++ b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py
@@ -20,7 +20,9 @@


 from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ...utils import (
+    logging,
+)
 from ..auto import CONFIG_MAPPING, AutoConfig


--- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py
+++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py
@@ -131,6 +131,7 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
        self,
        image: np.ndarray,
        size: Union[dict[str, int], int],
+        background_color: Optional[tuple[int, int, int]] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -142,6 +143,10 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
        Args:
            image (`np.ndarray`):
                Image to resize.
+            size (`dict[str, int]` or `int`):
+                The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
+            background_color (`tuple[int, int, int]`):
+                The background color to use for the padding.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
@@ -160,6 +165,7 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
        Returns:
            `np.ndarray`: The resized image.
        """
+        background_color = background_color if background_color is not None else self.background_color
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)

@@ -191,7 +197,7 @@ class DeepseekVLImageProcessor(BaseImageProcessor):
        # Expand and pad the images to obtain a square image of dimensions `size x size`
        image = self.pad_to_square(
            image=image,
-            background_color=self.background_color,
+            background_color=background_color,
            input_data_format=input_data_format,
        )
        return image
@@ -406,9 +412,5 @@ class DeepseekVLImageProcessor(BaseImageProcessor):

        return result

-    def postprocess(self):
-        """Applies post-processing to the decoded image tokens by reversing transformations applied during preprocessing."""
-        raise AttributeError("Not needed for DeepseekVL")
-

 __all__ = ["DeepseekVLImageProcessor"]
--- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py
+++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py
@@ -0,0 +1,199 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_deepseek_vl.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import torch.nn.functional as F
+
+from ...image_processing_utils import BatchFeature
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+)
+
+
+if is_torch_available():
+    import torch
+
+
+class DeepseekVLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    r"""
+    min_size (`int`, *optional*, defaults to 14):
+        The minimum allowed size for the resized image. Ensures that neither the height nor width
+        falls below this value after resizing.
+    """
+
+    min_size: int
+
+
+@auto_docstring
+class DeepseekVLImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"height": 384, "width": 384}
+    min_size = 14
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    valid_kwargs = DeepseekVLFastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[DeepseekVLFastImageProcessorKwargs]):
+        super().__init__(**kwargs)
+        if kwargs.get("image_mean", None) is None:
+            background_color = (127, 127, 127)
+        else:
+            background_color = tuple([int(x * 255) for x in kwargs.get("image_mean")])
+        self.background_color = tuple(background_color)
+
+    def resize(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        min_size: int,
+        interpolation: "F.InterpolationMode" = None,
+        antialias: bool = True,
+        **kwargs,
+    ) -> "torch.Tensor":
+        if size.height is None or size.width is None or size.height != size.width:
+            raise ValueError(
+                f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
+            )
+        size = size.height
+
+        height, width = image.shape[-2:]
+        max_size = max(height, width)
+
+        delta = size / max_size
+        # Largest side becomes `size` and the other side is scaled according to the aspect ratio.
+        output_size_nonpadded = SizeDict(
+            height=max(int(height * delta), min_size),
+            width=max(int(width * delta), min_size),
+        )
+
+        return super().resize(image, size=output_size_nonpadded, interpolation=interpolation, antialias=antialias)
+
+    def pad_to_square(
+        self,
+        images: "torch.Tensor",
+        background_color: Union[int, tuple[int, int, int]] = 0,
+    ) -> "torch.Tensor":
+        """
+        Pads an image to a square based on the longest edge.
+
+        Args:
+            images (`torch.Tensor`):
+                The images to pad.
+            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
+                The color to use for the padding. Can be an integer for single channel or a
+                tuple of integers representing for multi-channel images. If passed as integer
+                in mutli-channel mode, it will default to `0` in subsequent channels.
+
+        Returns:
+            `torch.Tensor`: The padded images.
+        """
+        height, width = images.shape[-2:]
+        num_channels = images.shape[1]
+        batch_size = images.shape[0]
+
+        if height == width:
+            return images
+
+        max_dim = max(height, width)
+
+        # Ensure background_color is the correct shape
+        if isinstance(background_color, int):
+            background_color = [background_color]
+        elif len(background_color) != num_channels:
+            raise ValueError(
+                f"background_color must have no more than {num_channels} elements to match the number of channels"
+            )
+
+        padded_images = torch.zeros(
+            (batch_size, num_channels, max_dim, max_dim), dtype=images.dtype, device=images.device
+        )
+        for i, color in enumerate(background_color):
+            padded_images[:, i, :, :] = color
+        if width > height:
+            start = (max_dim - height) // 2
+            padded_images[:, :, start : start + height, :] = images
+        else:
+            start = (max_dim - width) // 2
+            padded_images[:, :, :, start : start + width] = images
+
+        return padded_images
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        min_size: int,
+        interpolation: Optional["F.InterpolationMode"],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        do_pad: bool = True,
+        **kwargs,
+    ) -> BatchFeature:
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(
+                    image=stacked_images, size=size, min_size=min_size, interpolation=interpolation
+                )
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_pad:
+                stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+
+__all__ = ["DeepseekVLImageProcessorFast"]
--- a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py
+++ b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py
@@ -33,6 +33,7 @@ from ...utils import (
 from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel
 from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast
 from ..janus.image_processing_janus import JanusImageProcessor
+from ..janus.image_processing_janus_fast import JanusImageProcessorFast
 from ..janus.modeling_janus import JanusForConditionalGeneration, JanusModel, JanusPreTrainedModel


@@ -181,6 +182,9 @@ class DeepseekVLForConditionalGeneration(JanusForConditionalGeneration):


 class DeepseekVLImageProcessor(JanusImageProcessor):
+    def __init__(self, **super_kwargs):
+        super().__init__(**super_kwargs)
+
    def postprocess(self):
        raise AttributeError("Not needed for DeepseekVL")

@@ -188,6 +192,14 @@ class DeepseekVLImageProcessor(JanusImageProcessor):
        raise AttributeError("Not needed for DeepseekVL")


+class DeepseekVLImageProcessorFast(JanusImageProcessorFast):
+    def __init__(self, **super_kwargs):
+        super().__init__(**super_kwargs)
+
+    def postprocess(self):
+        raise AttributeError("Not needed for DeepseekVL")
+
+
 class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {"padding": False},
@@ -322,5 +334,6 @@ __all__ = [
    "DeepseekVLModel",
    "DeepseekVLForConditionalGeneration",
    "DeepseekVLImageProcessor",
+    "DeepseekVLImageProcessorFast",
    "DeepseekVLProcessor",
 ]
--- a/src/transformers/models/deepseek_vl_hybrid/init.py
+++ b/src/transformers/models/deepseek_vl_hybrid/init.py
@@ -21,6 +21,7 @@ if TYPE_CHECKING:
    from .configuration_deepseek_vl_hybrid import *
    from .image_processing_deepseek_vl_fast_hybrid import *
    from .image_processing_deepseek_vl_hybrid import *
+    from .image_processing_deepseek_vl_hybrid_fast import *
    from .modeling_deepseek_vl_hybrid import *
    from .processing_deepseek_vl_hybrid import *
 else:
--- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py
+++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py
@@ -154,14 +154,15 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
            self.background_color = tuple([int(x * 255) for x in image_mean])

        if high_res_image_mean is None:
-            self.background_color = (127, 127, 127)
+            self.high_res_background_color = (127, 127, 127)
        else:
-            self.background_color = tuple([int(x * 255) for x in high_res_image_mean])
+            self.high_res_background_color = tuple([int(x * 255) for x in high_res_image_mean])

    def resize(
        self,
        image: np.ndarray,
        size: Union[dict[str, int], int],
+        background_color: Optional[tuple[int, int, int]] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -173,6 +174,10 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
        Args:
            image (`np.ndarray`):
                Image to resize.
+            size (`dict[str, int]` or `int`):
+                The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
+            background_color (`tuple[int, int, int]`):
+                The background color to use for the padding.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
@@ -191,6 +196,7 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
        Returns:
            `np.ndarray`: The resized image.
        """
+        background_color = background_color if background_color is not None else self.background_color
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)

@@ -222,7 +228,7 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
        # Expand and pad the images to obtain a square image of dimensions `size x size`
        image = self.pad_to_square(
            image=image,
-            background_color=self.background_color,
+            background_color=background_color,
            input_data_format=input_data_format,
        )
        return image
@@ -361,16 +367,20 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):
            # high_res_image: resize (high) -> rescale -> normalize (high)
            # low_res_image:  resize (high) -> rescale -> resize (low) -> normalize (low)
            high_res_image = image
-
            if do_resize:
                high_res_image = self.resize(
                    image=high_res_image,
                    size=high_res_size_dict,
+                    background_color=self.high_res_background_color,
                    resample=high_res_resample,
                    input_data_format=input_data_format,
                )
                image = self.resize(
-                    image=high_res_image, size=size_dict, resample=resample, input_data_format=input_data_format
+                    image=high_res_image,
+                    size=size_dict,
+                    background_color=self.background_color,
+                    resample=resample,
+                    input_data_format=input_data_format,
                )

            if do_rescale:
@@ -475,9 +485,5 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor):

        return result

-    def postprocess(self):
-        """Applies post-processing to the decoded image tokens by reversing transformations applied during preprocessing."""
-        raise AttributeError("Not needed for DeepseekVLHybrid")
-

 __all__ = ["DeepseekVLHybridImageProcessor"]
--- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py
+++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py
@@ -0,0 +1,326 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import torch
+
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    BatchFeature,
+    DefaultFastImageProcessorKwargs,
+    get_size_dict,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ChannelDimension, PILImageResampling, SizeDict
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+)
+
+
+if is_torchvision_v2_available():
+    from torchvision.transforms.v2 import functional as F
+
+    from ...image_utils import pil_torch_interpolation_mapping
+elif is_torchvision_available():
+    from torchvision.transforms import functional as F
+
+    from ...image_utils import pil_torch_interpolation_mapping
+
+
+class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    r"""
+    min_size (`int`, *optional*, defaults to 14):
+        The minimum allowed size for the resized image. Ensures that neither the height nor width
+        falls below this value after resizing.
+     high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
+        Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
+        method.
+    high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+        Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+        overridden by the `high_res_resample` parameter in the `preprocess` method.
+    high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+        Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
+        channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
+    high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+        Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
+        number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
+    """
+
+    min_size: int
+    high_res_size: dict
+    high_res_resample: "PILImageResampling"
+    high_res_image_mean: list[float]
+    high_res_image_std: list[float]
+
+
+@auto_docstring
+class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"height": 384, "width": 384}
+    min_size = 14
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    valid_kwargs = DeepseekVLHybridFastImageProcessorKwargs
+    high_res_image_mean = OPENAI_CLIP_MEAN
+    high_res_image_std = OPENAI_CLIP_STD
+    high_res_size = {"height": 1024, "width": 1024}
+    high_res_resample = PILImageResampling.BICUBIC
+
+    def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]):
+        if kwargs.get("image_mean", None) is None:
+            background_color = (127, 127, 127)
+        else:
+            background_color = tuple([int(x * 255) for x in kwargs.get("image_mean")])
+        if kwargs.get("high_res_image_mean", None) is None:
+            high_res_background_color = (127, 127, 127)
+        else:
+            high_res_background_color = tuple([int(x * 255) for x in kwargs.get("high_res_image_mean")])
+        super().__init__(**kwargs)
+        self.background_color = tuple(background_color)
+        self.high_res_background_color = tuple(high_res_background_color)
+
+    def resize(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        min_size: int,
+        interpolation: "F.InterpolationMode" = None,
+        antialias: bool = True,
+        **kwargs,
+    ) -> "torch.Tensor":
+        if size.height is None or size.width is None or size.height != size.width:
+            raise ValueError(
+                f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
+            )
+        size = size.height
+
+        height, width = image.shape[-2:]
+        max_size = max(height, width)
+
+        delta = size / max_size
+        # Largest side becomes `size` and the other side is scaled according to the aspect ratio.
+        output_size_nonpadded = SizeDict(
+            height=max(int(height * delta), min_size),
+            width=max(int(width * delta), min_size),
+        )
+
+        return super().resize(image, size=output_size_nonpadded, interpolation=interpolation, antialias=antialias)
+
+    def pad_to_square(
+        self,
+        images: "torch.Tensor",
+        background_color: Union[int, tuple[int, int, int]] = 0,
+    ) -> "torch.Tensor":
+        """
+        Pads an image to a square based on the longest edge.
+
+        Args:
+            images (`torch.Tensor`):
+                The images to pad.
+            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
+                The color to use for the padding. Can be an integer for single channel or a
+                tuple of integers representing for multi-channel images. If passed as integer
+                in mutli-channel mode, it will default to `0` in subsequent channels.
+
+        Returns:
+            `torch.Tensor`: The padded images.
+        """
+        height, width = images.shape[-2:]
+        num_channels = images.shape[1]
+        batch_size = images.shape[0]
+
+        if height == width:
+            return images
+
+        max_dim = max(height, width)
+
+        # Ensure background_color is the correct shape
+        if isinstance(background_color, int):
+            background_color = [background_color]
+        elif len(background_color) != num_channels:
+            raise ValueError(
+                f"background_color must have no more than {num_channels} elements to match the number of channels"
+            )
+
+        padded_images = torch.zeros(
+            (batch_size, num_channels, max_dim, max_dim), dtype=images.dtype, device=images.device
+        )
+        for i, color in enumerate(background_color):
+            padded_images[:, i, :, :] = color
+        if width > height:
+            start = (max_dim - height) // 2
+            padded_images[:, :, start : start + height, :] = images
+        else:
+            start = (max_dim - width) // 2
+            padded_images[:, :, :, start : start + width] = images
+
+        return padded_images
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        high_res_size: SizeDict,
+        min_size: int,
+        interpolation: Optional["F.InterpolationMode"],
+        high_res_interpolation: Optional["F.InterpolationMode"],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        high_res_image_mean: Optional[Union[float, list[float]]],
+        high_res_image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        do_pad: bool = True,
+        **kwargs,
+    ) -> BatchFeature:
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        high_res_resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_high_res_images = self.resize(
+                    image=stacked_images, size=high_res_size, min_size=min_size, interpolation=high_res_interpolation
+                )
+            high_res_resized_images_grouped[shape] = stacked_high_res_images
+        high_res_resized_images = reorder_images(high_res_resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_high_res_images, grouped_high_res_images_index = group_images_by_shape(
+            high_res_resized_images, disable_grouping=disable_grouping
+        )
+        high_res_padded_images = {}
+        high_res_processed_images_grouped = {}
+        for shape, stacked_high_res_images in grouped_high_res_images.items():
+            if do_pad:
+                stacked_high_res_images = self.pad_to_square(
+                    stacked_high_res_images, background_color=self.high_res_background_color
+                )
+                high_res_padded_images[shape] = stacked_high_res_images
+            # Fused rescale and normalize
+            stacked_high_res_images = self.rescale_and_normalize(
+                stacked_high_res_images,
+                do_rescale,
+                rescale_factor,
+                do_normalize,
+                high_res_image_mean,
+                high_res_image_std,
+            )
+            high_res_processed_images_grouped[shape] = stacked_high_res_images
+        high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
+        high_res_processed_images = (
+            torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
+        )
+
+        resized_images_grouped = {}
+        for shape, stacked_high_res_padded_images in high_res_padded_images.items():
+            if do_resize:
+                stacked_images = self.resize(
+                    image=stacked_high_res_padded_images, size=size, min_size=min_size, interpolation=interpolation
+                )
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_high_res_images_index)
+
+        grouped_resized_images, grouped_resized_images_index = group_images_by_shape(
+            resized_images, disable_grouping=disable_grouping
+        )
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_resized_images.items():
+            if do_pad:
+                stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+        processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(
+            data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},
+            tensor_type=return_tensors,
+        )
+
+    def _further_process_kwargs(
+        self,
+        size: Optional[SizeDict] = None,
+        high_res_size: Optional[SizeDict] = None,
+        default_to_square: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        high_res_image_mean: Optional[Union[float, list[float]]] = None,
+        high_res_image_std: Optional[Union[float, list[float]]] = None,
+        data_format: Optional[ChannelDimension] = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Update kwargs that need further processing before being validated
+        Can be overridden by subclasses to customize the processing of kwargs.
+        """
+        if kwargs is None:
+            kwargs = {}
+        if size is not None:
+            size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square))
+        if high_res_size is not None:
+            high_res_size = SizeDict(**get_size_dict(size=high_res_size, default_to_square=default_to_square))
+        if isinstance(image_mean, list):
+            image_mean = tuple(image_mean)
+        if isinstance(image_std, list):
+            image_std = tuple(image_std)
+        if isinstance(high_res_image_mean, list):
+            high_res_image_mean = tuple(high_res_image_mean)
+        if isinstance(high_res_image_std, list):
+            high_res_image_std = tuple(high_res_image_std)
+        if data_format is None:
+            data_format = ChannelDimension.FIRST
+
+        high_res_resample = kwargs.pop("high_res_resample")
+        kwargs["high_res_interpolation"] = (
+            pil_torch_interpolation_mapping[high_res_resample]
+            if isinstance(high_res_resample, (int, PILImageResampling))
+            else high_res_resample
+        )
+
+        kwargs["size"] = size
+        kwargs["high_res_size"] = high_res_size
+        kwargs["default_to_square"] = default_to_square
+        kwargs["image_mean"] = image_mean
+        kwargs["image_std"] = image_std
+        kwargs["high_res_image_mean"] = high_res_image_mean
+        kwargs["high_res_image_std"] = high_res_image_std
+        kwargs["data_format"] = data_format
+
+        return kwargs
+
+
+__all__ = ["DeepseekVLHybridImageProcessorFast"]
--- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
+++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
@@ -20,7 +20,10 @@ import torch.nn as nn
 from ...cache_utils import Cache
 from ...image_processing_utils_fast import (
    BatchFeature,
+    DefaultFastImageProcessorKwargs,
    get_size_dict,
+    group_images_by_shape,
+    reorder_images,
 )
 from ...image_transforms import convert_to_rgb, to_channel_dimension_format
 from ...image_utils import (
@@ -29,6 +32,7 @@ from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
+    SizeDict,
    infer_channel_dimension_format,
    is_scaled_image,
    make_flat_list_of_images,
@@ -48,11 +52,14 @@ from ...utils import (
    auto_docstring,
    can_return_tuple,
    filter_out_non_signature_kwargs,
+    is_torchvision_available,
+    is_torchvision_v2_available,
    logging,
 )
 from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel
 from ..deepseek_vl.configuration_deepseek_vl import DeepseekVLConfig
 from ..deepseek_vl.image_processing_deepseek_vl import DeepseekVLImageProcessor
+from ..deepseek_vl.image_processing_deepseek_vl_fast import DeepseekVLImageProcessorFast
 from ..deepseek_vl.modeling_deepseek_vl import (
    DeepseekVLForConditionalGeneration,
    DeepseekVLModel,
@@ -63,6 +70,16 @@ from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCa
 from ..sam.modeling_sam import SamLayerNorm, SamVisionNeck


+if is_torchvision_v2_available():
+    from torchvision.transforms.v2 import functional as F
+
+    from ...image_utils import pil_torch_interpolation_mapping
+elif is_torchvision_available():
+    from torchvision.transforms import functional as F
+
+    from ...image_utils import pil_torch_interpolation_mapping
+
+
 logger = logging.get_logger(__name__)


@@ -516,9 +533,9 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
        )

        if high_res_image_mean is None:
-            self.background_color = (127, 127, 127)
+            self.high_res_background_color = (127, 127, 127)
        else:
-            self.background_color = tuple([int(x * 255) for x in high_res_image_mean])
+            self.high_res_background_color = tuple([int(x * 255) for x in high_res_image_mean])

    @filter_out_non_signature_kwargs()
    def preprocess(
@@ -654,16 +671,20 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
            # high_res_image: resize (high) -> rescale -> normalize (high)
            # low_res_image:  resize (high) -> rescale -> resize (low) -> normalize (low)
            high_res_image = image
-
            if do_resize:
                high_res_image = self.resize(
                    image=high_res_image,
                    size=high_res_size_dict,
+                    background_color=self.high_res_background_color,
                    resample=high_res_resample,
                    input_data_format=input_data_format,
                )
                image = self.resize(
-                    image=high_res_image, size=size_dict, resample=resample, input_data_format=input_data_format
+                    image=high_res_image,
+                    size=size_dict,
+                    background_color=self.background_color,
+                    resample=resample,
+                    input_data_format=input_data_format,
                )

            if do_rescale:
@@ -695,6 +716,192 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
        return BatchFeature(data=data, tensor_type=return_tensors)


+class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    r"""
+    min_size (`int`, *optional*, defaults to 14):
+        The minimum allowed size for the resized image. Ensures that neither the height nor width
+        falls below this value after resizing.
+     high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`):
+        Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess`
+        method.
+    high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+        Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+        overridden by the `high_res_resample` parameter in the `preprocess` method.
+    high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+        Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
+        channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
+    high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+        Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
+        number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
+    """
+
+    min_size: int
+    high_res_size: dict
+    high_res_resample: "PILImageResampling"
+    high_res_image_mean: list[float]
+    high_res_image_std: list[float]
+
+
+class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
+    high_res_image_mean = OPENAI_CLIP_MEAN
+    high_res_image_std = OPENAI_CLIP_STD
+    high_res_size = {"height": 1024, "width": 1024}
+    high_res_resample = PILImageResampling.BICUBIC
+
+    def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]):
+        if kwargs.get("image_mean", None) is None:
+            background_color = (127, 127, 127)
+        else:
+            background_color = tuple([int(x * 255) for x in kwargs.get("image_mean")])
+        if kwargs.get("high_res_image_mean", None) is None:
+            high_res_background_color = (127, 127, 127)
+        else:
+            high_res_background_color = tuple([int(x * 255) for x in kwargs.get("high_res_image_mean")])
+        DeepseekVLImageProcessorFast().__init__(**kwargs)
+        self.background_color = tuple(background_color)
+        self.high_res_background_color = tuple(high_res_background_color)
+
+    def _further_process_kwargs(
+        self,
+        size: Optional[SizeDict] = None,
+        high_res_size: Optional[SizeDict] = None,
+        default_to_square: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        high_res_image_mean: Optional[Union[float, list[float]]] = None,
+        high_res_image_std: Optional[Union[float, list[float]]] = None,
+        data_format: Optional[ChannelDimension] = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Update kwargs that need further processing before being validated
+        Can be overridden by subclasses to customize the processing of kwargs.
+        """
+        if kwargs is None:
+            kwargs = {}
+        if size is not None:
+            size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square))
+        if high_res_size is not None:
+            high_res_size = SizeDict(**get_size_dict(size=high_res_size, default_to_square=default_to_square))
+        if isinstance(image_mean, list):
+            image_mean = tuple(image_mean)
+        if isinstance(image_std, list):
+            image_std = tuple(image_std)
+        if isinstance(high_res_image_mean, list):
+            high_res_image_mean = tuple(high_res_image_mean)
+        if isinstance(high_res_image_std, list):
+            high_res_image_std = tuple(high_res_image_std)
+        if data_format is None:
+            data_format = ChannelDimension.FIRST
+
+        high_res_resample = kwargs.pop("high_res_resample")
+        kwargs["high_res_interpolation"] = (
+            pil_torch_interpolation_mapping[high_res_resample]
+            if isinstance(high_res_resample, (int, PILImageResampling))
+            else high_res_resample
+        )
+
+        kwargs["size"] = size
+        kwargs["high_res_size"] = high_res_size
+        kwargs["default_to_square"] = default_to_square
+        kwargs["image_mean"] = image_mean
+        kwargs["image_std"] = image_std
+        kwargs["high_res_image_mean"] = high_res_image_mean
+        kwargs["high_res_image_std"] = high_res_image_std
+        kwargs["data_format"] = data_format
+
+        return kwargs
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        high_res_size: SizeDict,
+        min_size: int,
+        interpolation: Optional["F.InterpolationMode"],
+        high_res_interpolation: Optional["F.InterpolationMode"],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        high_res_image_mean: Optional[Union[float, list[float]]],
+        high_res_image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        do_pad: bool = True,
+        **kwargs,
+    ) -> BatchFeature:
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        high_res_resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_high_res_images = self.resize(
+                    image=stacked_images, size=high_res_size, min_size=min_size, interpolation=high_res_interpolation
+                )
+            high_res_resized_images_grouped[shape] = stacked_high_res_images
+        high_res_resized_images = reorder_images(high_res_resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_high_res_images, grouped_high_res_images_index = group_images_by_shape(
+            high_res_resized_images, disable_grouping=disable_grouping
+        )
+        high_res_padded_images = {}
+        high_res_processed_images_grouped = {}
+        for shape, stacked_high_res_images in grouped_high_res_images.items():
+            if do_pad:
+                stacked_high_res_images = self.pad_to_square(
+                    stacked_high_res_images, background_color=self.high_res_background_color
+                )
+                high_res_padded_images[shape] = stacked_high_res_images
+            # Fused rescale and normalize
+            stacked_high_res_images = self.rescale_and_normalize(
+                stacked_high_res_images,
+                do_rescale,
+                rescale_factor,
+                do_normalize,
+                high_res_image_mean,
+                high_res_image_std,
+            )
+            high_res_processed_images_grouped[shape] = stacked_high_res_images
+        high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
+        high_res_processed_images = (
+            torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
+        )
+
+        resized_images_grouped = {}
+        for shape, stacked_high_res_padded_images in high_res_padded_images.items():
+            if do_resize:
+                stacked_images = self.resize(
+                    image=stacked_high_res_padded_images, size=size, min_size=min_size, interpolation=interpolation
+                )
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_high_res_images_index)
+
+        grouped_resized_images, grouped_resized_images_index = group_images_by_shape(
+            resized_images, disable_grouping=disable_grouping
+        )
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_resized_images.items():
+            if do_pad:
+                stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+        processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(
+            data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},
+            tensor_type=return_tensors,
+        )
+
+
 class DeepseekVLHybridProcessorKwargs(DeepseekVLProcessorKwargs):
    pass

@@ -773,5 +980,6 @@ __all__ = [
    "DeepseekVLHybridModel",
    "DeepseekVLHybridForConditionalGeneration",
    "DeepseekVLHybridImageProcessor",
+    "DeepseekVLHybridImageProcessorFast",
    "DeepseekVLHybridProcessor",
 ]
--- a/src/transformers/models/janus/init.py
+++ b/src/transformers/models/janus/init.py
@@ -20,6 +20,7 @@ from ...utils.import_utils import define_import_structure
 if TYPE_CHECKING:
    from .configuration_janus import *
    from .image_processing_janus import *
+    from .image_processing_janus_fast import *
    from .modeling_janus import *
    from .processing_janus import *
 else:
--- a/src/transformers/models/janus/image_processing_janus.py
+++ b/src/transformers/models/janus/image_processing_janus.py
@@ -134,6 +134,7 @@ class JanusImageProcessor(BaseImageProcessor):
        self,
        image: np.ndarray,
        size: Union[dict[str, int], int],
+        background_color: Optional[tuple[int, int, int]] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -145,6 +146,10 @@ class JanusImageProcessor(BaseImageProcessor):
        Args:
            image (`np.ndarray`):
                Image to resize.
+            size (`dict[str, int]` or `int`):
+                The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
+            background_color (`tuple[int, int, int]`):
+                The background color to use for the padding.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
@@ -163,6 +168,7 @@ class JanusImageProcessor(BaseImageProcessor):
        Returns:
            `np.ndarray`: The resized image.
        """
+        background_color = background_color if background_color is not None else self.background_color
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)

@@ -194,7 +200,7 @@ class JanusImageProcessor(BaseImageProcessor):
        # Expand and pad the images to obtain a square image of dimensions `size x size`
        image = self.pad_to_square(
            image=image,
-            background_color=self.background_color,
+            background_color=background_color,
            input_data_format=input_data_format,
        )
        return image
--- a/src/transformers/models/janus/image_processing_janus_fast.py
+++ b/src/transformers/models/janus/image_processing_janus_fast.py
@@ -0,0 +1,245 @@
+# coding=utf-8
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Union
+
+from ...image_processing_utils import BatchFeature
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+)
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+)
+
+
+if is_torch_available():
+    import torch
+if is_torchvision_v2_available():
+    from torchvision.transforms.v2 import functional as F
+elif is_torchvision_available():
+    from torchvision.transforms import functional as F
+
+
+class JanusFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    r"""
+    min_size (`int`, *optional*, defaults to 14):
+        The minimum allowed size for the resized image. Ensures that neither the height nor width
+        falls below this value after resizing.
+    """
+
+    min_size: int
+
+
+@auto_docstring
+class JanusImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"height": 384, "width": 384}
+    min_size = 14
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    valid_kwargs = JanusFastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[JanusFastImageProcessorKwargs]):
+        if kwargs.get("image_mean", None) is None:
+            background_color = (127, 127, 127)
+        else:
+            background_color = tuple([int(x * 255) for x in kwargs.get("image_mean")])
+        super().__init__(**kwargs)
+        self.background_color = tuple(background_color)
+
+    def resize(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        min_size: int,
+        interpolation: "F.InterpolationMode" = None,
+        antialias: bool = True,
+        **kwargs,
+    ) -> "torch.Tensor":
+        if size.height is None or size.width is None or size.height != size.width:
+            raise ValueError(
+                f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
+            )
+        size = size.height
+
+        height, width = image.shape[-2:]
+        max_size = max(height, width)
+
+        delta = size / max_size
+        # Largest side becomes `size` and the other side is scaled according to the aspect ratio.
+        output_size_nonpadded = SizeDict(
+            height=max(int(height * delta), min_size),
+            width=max(int(width * delta), min_size),
+        )
+
+        return super().resize(image, size=output_size_nonpadded, interpolation=interpolation, antialias=antialias)
+
+    def pad_to_square(
+        self,
+        images: "torch.Tensor",
+        background_color: Union[int, tuple[int, int, int]] = 0,
+    ) -> "torch.Tensor":
+        """
+        Pads an image to a square based on the longest edge.
+
+        Args:
+            images (`torch.Tensor`):
+                The images to pad.
+            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
+                The color to use for the padding. Can be an integer for single channel or a
+                tuple of integers representing for multi-channel images. If passed as integer
+                in mutli-channel mode, it will default to `0` in subsequent channels.
+
+        Returns:
+            `torch.Tensor`: The padded images.
+        """
+        height, width = images.shape[-2:]
+        num_channels = images.shape[1]
+        batch_size = images.shape[0]
+
+        if height == width:
+            return images
+
+        max_dim = max(height, width)
+
+        # Ensure background_color is the correct shape
+        if isinstance(background_color, int):
+            background_color = [background_color]
+        elif len(background_color) != num_channels:
+            raise ValueError(
+                f"background_color must have no more than {num_channels} elements to match the number of channels"
+            )
+
+        padded_images = torch.zeros(
+            (batch_size, num_channels, max_dim, max_dim), dtype=images.dtype, device=images.device
+        )
+        for i, color in enumerate(background_color):
+            padded_images[:, i, :, :] = color
+        if width > height:
+            start = (max_dim - height) // 2
+            padded_images[:, :, start : start + height, :] = images
+        else:
+            start = (max_dim - width) // 2
+            padded_images[:, :, :, start : start + width] = images
+
+        return padded_images
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        min_size: int,
+        interpolation: Optional["F.InterpolationMode"],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        do_pad: bool = True,
+        **kwargs,
+    ) -> BatchFeature:
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(
+                    image=stacked_images, size=size, min_size=min_size, interpolation=interpolation
+                )
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_pad:
+                stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+    def postprocess(
+        self,
+        images: ImageInput,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[list[float]] = None,
+        image_std: Optional[list[float]] = None,
+        return_tensors: Optional[str] = None,
+    ) -> "torch.Tensor":
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = 1.0 / self.rescale_factor if rescale_factor is None else rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        image_mean = tuple(-rescale_factor * mean / std for mean, std in zip(image_mean, image_std))
+        image_std = tuple(1 / std for std in image_std)
+
+        images = self.preprocess(
+            images,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=False,
+            do_pad=False,
+            return_tensors=return_tensors,
+        ).pixel_values
+        if do_rescale:
+            images = [image.clip(0, 255).to(torch.uint8) for image in images]
+
+        if do_normalize and do_rescale and return_tensors == "PIL.Image.Image":
+            images = [F.to_pil_image(image) for image in images]
+
+        data = {"pixel_values": images}
+        return_tensors = return_tensors if return_tensors != "PIL.Image.Image" else None
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["JanusImageProcessorFast"]
--- a/src/transformers/models/janus/modular_janus.py
+++ b/src/transformers/models/janus/modular_janus.py
@@ -1437,6 +1437,7 @@ class JanusImageProcessor(BlipImageProcessor):
        self,
        image: np.ndarray,
        size: Union[dict[str, int], int],
+        background_color: Optional[tuple[int, int, int]] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -1448,6 +1449,10 @@ class JanusImageProcessor(BlipImageProcessor):
        Args:
            image (`np.ndarray`):
                Image to resize.
+            size (`dict[str, int]` or `int`):
+                The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
+            background_color (`tuple[int, int, int]`):
+                The background color to use for the padding.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
@@ -1466,6 +1471,7 @@ class JanusImageProcessor(BlipImageProcessor):
        Returns:
            `np.ndarray`: The resized image.
        """
+        background_color = background_color if background_color is not None else self.background_color
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)

@@ -1497,7 +1503,7 @@ class JanusImageProcessor(BlipImageProcessor):
        # Expand and pad the images to obtain a square image of dimensions `size x size`
        image = self.pad_to_square(
            image=image,
-            background_color=self.background_color,
+            background_color=background_color,
            input_data_format=input_data_format,
        )
        return image
--- a/tests/models/deepseek_vl/test_image_processing_deepseek_vl.py
+++ b/tests/models/deepseek_vl/test_image_processing_deepseek_vl.py
@@ -17,14 +17,21 @@
 import unittest

 from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_vision_available
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available

 from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs


+if is_torch_available():
+    import torch
+
+
 if is_vision_available():
    from transformers import DeepseekVLImageProcessor

+    if is_torchvision_available():
+        from transformers import DeepseekVLImageProcessorFast
+

 # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester with ViT->DeepseekVL
 class DeepseekVLImageProcessingTester:
@@ -83,10 +90,9 @@ class DeepseekVLImageProcessingTester:

@require_torch
@require_vision
-# Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTest with ViT->DeepseekVL
 class DeepseekVLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
-    # Ignore copy
    image_processing_class = DeepseekVLImageProcessor if is_vision_available() else None
+    fast_image_processing_class = DeepseekVLImageProcessorFast if is_torchvision_available() else None

    def setUp(self):
        super().setUp()
@@ -113,6 +119,33 @@ class DeepseekVLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
            image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
            self.assertEqual(image_processor.size, {"height": 42, "width": 42})

+    @require_vision
+    @require_torch
+    def test_slow_fast_equivalence_batched(self):
+        if not self.test_slow_image_processor or not self.test_fast_image_processor:
+            self.skipTest(reason="Skipping slow/fast equivalence test")
+
+        if self.image_processing_class is None or self.fast_image_processing_class is None:
+            self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
+
+        if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
+            self.skipTest(
+                reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
+            )
+
+        dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
+        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
+
+        encoding_slow = image_processor_slow(dummy_images, return_tensors=None)
+        encoding_fast = image_processor_fast(dummy_images, return_tensors=None)
+
+        # Overwrite as the outputs are not always all of the same shape (kept for BC)
+        for i in range(len(encoding_slow.pixel_values)):
+            self._assert_slow_fast_tensors_equivalence(
+                torch.from_numpy(encoding_slow.pixel_values[i]), encoding_fast.pixel_values[i]
+            )
+
    # Ignore copy
    @unittest.skip(reason="Not supported")
    def test_call_numpy_4_channels(self):
--- a/tests/models/deepseek_vl_hybrid/test_image_processing_deepseek_vl_hybrid.py
+++ b/tests/models/deepseek_vl_hybrid/test_image_processing_deepseek_vl_hybrid.py
@@ -13,13 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
 import unittest

 import numpy as np
+import requests

 from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_vision_available
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available

 from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs

@@ -32,6 +32,9 @@ if is_vision_available():

    from transformers import DeepseekVLHybridImageProcessor

+    if is_torchvision_available():
+        from transformers import DeepseekVLHybridImageProcessorFast
+

 class DeepseekVLHybridImageProcessingTester:
    def __init__(
@@ -104,6 +107,7 @@ class DeepseekVLHybridImageProcessingTester:
@require_vision
 class DeepseekVLHybridImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = DeepseekVLHybridImageProcessor if is_vision_available() else None
+    fast_image_processing_class = DeepseekVLHybridImageProcessorFast if is_torchvision_available() else None

    # Copied from tests.models.vit.test_image_processing_vit.ViTImageProcessingTester.setUp with ViT->DeepseekVLHybrid
    def setUp(self):
@@ -213,6 +217,59 @@ class DeepseekVLHybridImageProcessingTest(ImageProcessingTestMixin, unittest.Tes
                (self.image_processor_tester.batch_size, *expected_output_image_shape),
            )

+    @require_vision
+    @require_torch
+    def test_slow_fast_equivalence(self):
+        if not self.test_slow_image_processor or not self.test_fast_image_processor:
+            self.skipTest(reason="Skipping slow/fast equivalence test")
+
+        if self.image_processing_class is None or self.fast_image_processing_class is None:
+            self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
+
+        dummy_image = Image.open(
+            requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
+        )
+        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
+        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
+
+        encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
+        encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
+        self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
+        self._assert_slow_fast_tensors_equivalence(
+            encoding_slow.high_res_pixel_values, encoding_fast.high_res_pixel_values
+        )
+
+    @require_vision
+    @require_torch
+    def test_slow_fast_equivalence_batched(self):
+        if not self.test_slow_image_processor or not self.test_fast_image_processor:
+            self.skipTest(reason="Skipping slow/fast equivalence test")
+
+        if self.image_processing_class is None or self.fast_image_processing_class is None:
+            self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
+
+        if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
+            self.skipTest(
+                reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
+            )
+
+        dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
+        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
+
+        encoding_slow = image_processor_slow(dummy_images, return_tensors=None)
+        encoding_fast = image_processor_fast(dummy_images, return_tensors=None)
+
+        # Overwrite as the outputs are not always all of the same shape (kept for BC)
+        for i in range(len(encoding_slow.pixel_values)):
+            self._assert_slow_fast_tensors_equivalence(
+                torch.from_numpy(encoding_slow.pixel_values[i]), encoding_fast.pixel_values[i]
+            )
+        for i in range(len(encoding_slow.high_res_pixel_values)):
+            self._assert_slow_fast_tensors_equivalence(
+                torch.from_numpy(encoding_slow.high_res_pixel_values[i]), encoding_fast.high_res_pixel_values[i]
+            )
+
    @unittest.skip(reason="Not supported")
    def test_call_numpy_4_channels(self):
        pass
--- a/tests/models/janus/test_image_processing_janus.py
+++ b/tests/models/janus/test_image_processing_janus.py
@@ -18,7 +18,7 @@ import unittest
 import numpy as np

 from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_vision_available
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available

 from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs

@@ -31,6 +31,9 @@ if is_vision_available():

    from transformers import JanusImageProcessor

+    if is_torchvision_available():
+        from transformers import JanusImageProcessorFast
+

 class JanusImageProcessingTester:
    def __init__(
@@ -44,8 +47,8 @@ class JanusImageProcessingTester:
        do_resize=True,
        size=None,
        do_normalize=True,
-        image_mean=[1.0, 1.0, 1.0],
-        image_std=[1.0, 1.0, 1.0],
+        image_mean=[0.48145466, 0.4578275, 0.40821073],
+        image_std=[0.26862954, 0.26130258, 0.27577711],
        do_convert_rgb=True,
    ):
        size = size if size is not None else {"height": 384, "width": 384}
@@ -89,6 +92,7 @@ class JanusImageProcessingTester:
@require_vision
 class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = JanusImageProcessor if is_vision_available() else None
+    fast_image_processing_class = JanusImageProcessorFast if is_torchvision_available() else None

    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->Janus
    def setUp(self):
@@ -101,7 +105,8 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
            self.assertTrue(hasattr(image_processing, "do_resize"))
            self.assertTrue(hasattr(image_processing, "size"))
            self.assertTrue(hasattr(image_processing, "do_normalize"))
@@ -110,18 +115,20 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            self.assertTrue(hasattr(image_processing, "do_convert_rgb"))

    def test_image_processor_from_dict_with_kwargs(self):
-        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class.from_dict(self.image_processor_dict)
            self.assertEqual(image_processor.size, {"height": 384, "width": 384})
-        self.assertEqual(image_processor.image_mean, [1.0, 1.0, 1.0])
+            self.assertEqual(image_processor.image_mean, [0.48145466, 0.4578275, 0.40821073])

-        image_processor = self.image_processing_class.from_dict(
+            image_processor = image_processing_class.from_dict(
                self.image_processor_dict, size=42, image_mean=[1.0, 2.0, 1.0]
            )
            self.assertEqual(image_processor.size, {"height": 42, "width": 42})
            self.assertEqual(image_processor.image_mean, [1.0, 2.0, 1.0])

    def test_call_pil(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
            for image in image_inputs:
                self.assertIsInstance(image, Image.Image)
@@ -137,7 +144,8 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

    def test_call_numpy(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
            for image in image_inputs:
                self.assertIsInstance(image, np.ndarray)
@@ -151,7 +159,8 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

    def test_call_pytorch(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)

            for image in image_inputs:
@@ -166,7 +175,8 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

    def test_nested_input(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)

            # Test batched as a list of images.
@@ -183,6 +193,50 @@ class JanusImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            # Image processor should return same pixel values, independently of input format.
            self.assertTrue((encoded_images_nested == encoded_images).all())

+    @require_vision
+    @require_torch
+    def test_slow_fast_equivalence_batched(self):
+        if not self.test_slow_image_processor or not self.test_fast_image_processor:
+            self.skipTest(reason="Skipping slow/fast equivalence test")
+
+        if self.image_processing_class is None or self.fast_image_processing_class is None:
+            self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
+
+        if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
+            self.skipTest(
+                reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
+            )
+
+        dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
+        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
+
+        encoding_slow = image_processor_slow(dummy_images, return_tensors=None)
+        encoding_fast = image_processor_fast(dummy_images, return_tensors=None)
+
+        # Overwrite as the outputs are not always all of the same shape (kept for BC)
+        for i in range(len(encoding_slow.pixel_values)):
+            self._assert_slow_fast_tensors_equivalence(
+                torch.from_numpy(encoding_slow.pixel_values[i]), encoding_fast.pixel_values[i]
+            )
+
+    @require_vision
+    @require_torch
+    def test_slow_fast_equivalence_postprocess(self):
+        dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+        dummy_images = [image / 255.0 for image in dummy_images]
+        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
+        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
+
+        encoding_slow = image_processor_slow.postprocess(dummy_images, return_tensors=None)
+        encoding_fast = image_processor_fast.postprocess(dummy_images, return_tensors=None)
+
+        # Overwrite as the outputs are not always all of the same shape (kept for BC)
+        for i in range(len(encoding_slow.pixel_values)):
+            self._assert_slow_fast_tensors_equivalence(
+                torch.from_numpy(encoding_slow.pixel_values[i]).float(), encoding_fast.pixel_values[i].float()
+            )
+
    @unittest.skip(reason="Not supported")
    def test_call_numpy_4_channels(self):
        pass