From ea219ed164bead55a5513e8cfaa17a25d5613b9e Mon Sep 17 00:00:00 2001
From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
Date: Wed, 12 Mar 2025 19:44:05 -0400
Subject: [PATCH] Remove differences between init and preprocess kwargs for
 fast image processors (#36186)

* Remove differences between init and preprocess kwargs in fast image processors

* make modifications to got_ocr2

* update gemma3
---
 .../image_processing_utils_fast.py            | 53 +++++++++++--------
 .../image_processing_convnext_fast.py         | 18 ++-----
 .../image_processing_deformable_detr_fast.py  | 29 +++++-----
 .../models/detr/image_processing_detr_fast.py | 31 +++++------
 .../gemma3/image_processing_gemma3_fast.py    | 25 +++------
 .../image_processing_got_ocr2_fast.py         | 18 ++-----
 .../llava/image_processing_llava_fast.py      | 18 ++-----
 .../image_processing_llava_next_fast.py       | 19 ++-----
 .../image_processing_llava_onevision_fast.py  | 19 ++-----
 .../pixtral/image_processing_pixtral_fast.py  | 18 ++-----
 .../image_processing_qwen2_vl_fast.py         |  8 +--
 .../rt_detr/image_processing_rt_detr_fast.py  | 29 +++++-----
 .../models/rt_detr/modular_rt_detr.py         | 22 ++++----
 tests/test_image_processing_common.py         | 24 ++++++---
 utils/modular_model_converter.py              |  3 +-
 15 files changed, 136 insertions(+), 198 deletions(-)

diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py
index f990ce100d..a87db33704 100644
--- a/src/transformers/image_processing_utils_fast.py
+++ b/src/transformers/image_processing_utils_fast.py
@@ -126,7 +126,7 @@ def divide_to_patches(
     return patches
 
 
-class DefaultFastImageProcessorInitKwargs(TypedDict, total=False):
+class DefaultFastImageProcessorKwargs(TypedDict, total=False):
     do_resize: Optional[bool]
     size: Optional[Dict[str, int]]
     default_to_square: Optional[bool]
@@ -139,9 +139,6 @@ class DefaultFastImageProcessorInitKwargs(TypedDict, total=False):
     image_mean: Optional[Union[float, List[float]]]
     image_std: Optional[Union[float, List[float]]]
     do_convert_rgb: Optional[bool]
-
-
-class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwargs):
     return_tensors: Optional[Union[str, TensorType]]
     data_format: Optional[ChannelDimension]
     input_data_format: Optional[Union[str, ChannelDimension]]
@@ -185,8 +182,20 @@ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING = r"""
             Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
             number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
             Can be overridden by the `image_std` parameter in the `preprocess` method.
-        do_convert_rgb (`bool`, *optional*, defaults to `self.image_std`):
-            Whether to convert the image to RGB."""
+        do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+            Whether to convert the image to RGB.
+        return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`):
+            Returns stacked tensors if set to `pt`, otherwise returns a list of tensors.
+        data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`):
+            Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.
+        input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`):
+            The channel dimension format for the input image. If unset, the channel dimension format is inferred
+            from the input image. Can be one of:
+            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        device (`torch.device`, *optional*, defaults to `self.device`):
+            The device to process the images on. If unset, the device is inferred from the input images."""
 
 BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r"""
     Preprocess an image or batch of images.
@@ -219,20 +228,17 @@ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r"""
             `True`.
         do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
             Whether to convert the image to RGB.
-        return_tensors (`str` or `TensorType`, *optional*):
+        return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`):
             Returns stacked tensors if set to `pt, otherwise returns a list of tensors.
-        data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
-            The channel dimension format for the output image. Can be one of:
-            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-            - Unset: Use the channel dimension format of the input image.
-        input_data_format (`ChannelDimension` or `str`, *optional*):
+        data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`):
+            Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.
+        input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`):
             The channel dimension format for the input image. If unset, the channel dimension format is inferred
             from the input image. Can be one of:
             - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
             - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
             - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-        device (`torch.device`, *optional*):
+        device (`torch.device`, *optional*, defaults to `self.device`):
             The device to process the images on. If unset, the device is inferred from the input images."""
 
 
@@ -253,13 +259,16 @@ class BaseImageProcessorFast(BaseImageProcessor):
     rescale_factor = 1 / 255
     do_normalize = None
     do_convert_rgb = None
+    return_tensors = None
+    data_format = ChannelDimension.FIRST
+    input_data_format = None
+    device = None
     model_input_names = ["pixel_values"]
-    valid_init_kwargs = DefaultFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = DefaultFastImageProcessorPreprocessKwargs
+    valid_kwargs = DefaultFastImageProcessorKwargs
 
     def __init__(
         self,
-        **kwargs: Unpack[DefaultFastImageProcessorInitKwargs],
+        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
     ) -> None:
         super().__init__(**kwargs)
         size = kwargs.pop("size", self.size)
@@ -270,7 +279,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
         )
         crop_size = kwargs.pop("crop_size", self.crop_size)
         self.crop_size = get_size_dict(crop_size, param_name="crop_size") if crop_size is not None else None
-        for key in self.valid_init_kwargs.__annotations__.keys():
+        for key in self.valid_kwargs.__annotations__.keys():
             kwarg = kwargs.pop(key, None)
             if kwarg is not None:
                 setattr(self, key, kwarg)
@@ -553,14 +562,12 @@ class BaseImageProcessorFast(BaseImageProcessor):
     def preprocess(
         self,
         images: ImageInput,
-        **kwargs: Unpack[DefaultFastImageProcessorPreprocessKwargs],
+        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
     ) -> BatchFeature:
-        validate_kwargs(
-            captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_preprocess_kwargs.__annotations__.keys()
-        )
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys())
         # Set default kwargs from self. This ensures that if a kwarg is not provided
         # by the user, it gets its default value from the instance, or is set to None.
-        for kwarg_name in self.valid_preprocess_kwargs.__annotations__:
+        for kwarg_name in self.valid_kwargs.__annotations__:
             kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
 
         # Extract parameters that are only used for preparing the input images
diff --git a/src/transformers/models/convnext/image_processing_convnext_fast.py b/src/transformers/models/convnext/image_processing_convnext_fast.py
index c2a8e37d53..19f959f07d 100644
--- a/src/transformers/models/convnext/image_processing_convnext_fast.py
+++ b/src/transformers/models/convnext/image_processing_convnext_fast.py
@@ -21,8 +21,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -54,11 +53,7 @@ if is_torchvision_available():
         from torchvision.transforms import functional as F
 
 
-class ConvNextFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    crop_pct: Optional[float]
-
-
-class ConvNextFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class ConvNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     crop_pct: Optional[float]
 
 
@@ -81,10 +76,9 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
     crop_pct = 224 / 256
-    valid_init_kwargs = ConvNextFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = ConvNextFastImageProcessorPreprocessKwargs
+    valid_kwargs = ConvNextFastImageProcessorKwargs
 
-    def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]):
         super().__init__(**kwargs)
 
     @add_start_docstrings(
@@ -95,9 +89,7 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast):
             overridden by `crop_pct` in the`preprocess` method.
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)
 
     def resize(
diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py
index 2aee1802ce..850370e593 100644
--- a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py
+++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py
@@ -12,8 +12,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     SizeDict,
     get_image_size_for_max_height_width,
     get_max_height_width,
@@ -58,21 +57,12 @@ elif is_torchvision_available():
 logger = logging.get_logger(__name__)
 
 
-class DeformableDetrFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
+class DeformableDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     format: Optional[Union[str, AnnotationFormat]]
     do_convert_annotations: Optional[bool]
     do_pad: Optional[bool]
     pad_size: Optional[Dict[str, int]]
-
-
-class DeformableDetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
-    format: Optional[AnnotationFormat]
-    annotations: Optional[Dict]
-    do_convert_annotations: Optional[bool]
-    do_pad: Optional[bool]
-    pad_size: Optional[Dict[str, int]]
     return_segmentation_masks: Optional[bool]
-    masks_path: Optional[Union[str, pathlib.Path]]
 
 
 SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
@@ -294,6 +284,8 @@ def prepare_coco_panoptic_annotation(
             The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
             provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
             height and width in the batch.
+        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+            Whether to return segmentation masks.
     """,
 )
 class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
@@ -308,10 +300,9 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
     size = {"shortest_edge": 800, "longest_edge": 1333}
     default_to_square = False
     model_input_names = ["pixel_values", "pixel_mask"]
-    valid_init_kwargs = DeformableDetrFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = DeformableDetrFastImageProcessorPreprocessKwargs
+    valid_kwargs = DeformableDetrFastImageProcessorKwargs
 
-    def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs]) -> None:
         if "pad_and_return_pixel_mask" in kwargs:
             kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
 
@@ -605,7 +596,11 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
         """,
     )
     def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[DeformableDetrFastImageProcessorPreprocessKwargs]
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs],
     ) -> BatchFeature:
         if "pad_and_return_pixel_mask" in kwargs:
             kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
@@ -621,7 +616,7 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
             )
             kwargs["size"] = kwargs.pop("max_size")
 
-        return super().preprocess(images, **kwargs)
+        return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)
 
     def _preprocess(
         self,
diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py
index e49b176167..8d29a5796f 100644
--- a/src/transformers/models/detr/image_processing_detr_fast.py
+++ b/src/transformers/models/detr/image_processing_detr_fast.py
@@ -24,8 +24,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     SizeDict,
     get_image_size_for_max_height_width,
     get_max_height_width,
@@ -283,21 +282,12 @@ def prepare_coco_panoptic_annotation(
     return new_target
 
 
-class DetrFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
+class DetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     format: Optional[Union[str, AnnotationFormat]]
     do_convert_annotations: Optional[bool]
     do_pad: Optional[bool]
     pad_size: Optional[Dict[str, int]]
-
-
-class DetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
-    format: Optional[AnnotationFormat]
-    annotations: Optional[Dict]
-    do_convert_annotations: Optional[bool]
-    do_pad: Optional[bool]
-    pad_size: Optional[Dict[str, int]]
     return_segmentation_masks: Optional[bool]
-    masks_path: Optional[Union[str, pathlib.Path]]
 
 
 @add_start_docstrings(
@@ -319,6 +309,8 @@ class DetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocess
             The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
             provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
             height and width in the batch.
+        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+            Whether to return segmentation masks.
     """,
 )
 class DetrImageProcessorFast(BaseImageProcessorFast):
@@ -333,10 +325,9 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
     size = {"shortest_edge": 800, "longest_edge": 1333}
     default_to_square = False
     model_input_names = ["pixel_values", "pixel_mask"]
-    valid_init_kwargs = DetrFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = DetrFastImageProcessorPreprocessKwargs
+    valid_kwargs = DetrFastImageProcessorKwargs
 
-    def __init__(self, **kwargs: Unpack[DetrFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[DetrFastImageProcessorKwargs]) -> None:
         if "pad_and_return_pixel_mask" in kwargs:
             kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
 
@@ -629,7 +620,13 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
             Path to the directory containing the segmentation masks.
         """,
     )
-    def preprocess(self, images: ImageInput, **kwargs: Unpack[DetrFastImageProcessorPreprocessKwargs]) -> BatchFeature:
+    def preprocess(
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[DetrFastImageProcessorKwargs],
+    ) -> BatchFeature:
         if "pad_and_return_pixel_mask" in kwargs:
             kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
             logger.warning_once(
@@ -644,7 +641,7 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
             )
             kwargs["size"] = kwargs.pop("max_size")
 
-        return super().preprocess(images, **kwargs)
+        return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)
 
     def _preprocess(
         self,
diff --git a/src/transformers/models/gemma3/image_processing_gemma3_fast.py b/src/transformers/models/gemma3/image_processing_gemma3_fast.py
index 0a26f25231..50dfcb920f 100644
--- a/src/transformers/models/gemma3/image_processing_gemma3_fast.py
+++ b/src/transformers/models/gemma3/image_processing_gemma3_fast.py
@@ -24,8 +24,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
     BatchFeature,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     get_size_dict,
     group_images_by_shape,
     reorder_images,
@@ -67,14 +66,7 @@ if is_torchvision_available():
 logger = logging.get_logger(__name__)
 
 
-class Gemma3FastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    do_pan_and_scan: Optional[bool]
-    pan_and_scan_min_crop_size: Optional[int]
-    pan_and_scan_max_num_crops: Optional[int]
-    pan_and_scan_min_ratio_to_activate: Optional[float]
-
-
-class Gemma3FastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class Gemma3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     do_pan_and_scan: Optional[bool]
     pan_and_scan_min_crop_size: Optional[int]
     pan_and_scan_max_num_crops: Optional[int]
@@ -108,10 +100,9 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
     pan_and_scan_min_crop_size = None
     pan_and_scan_max_num_crops = None
     pan_and_scan_min_ratio_to_activate = None
-    valid_init_kwargs = Gemma3FastImageProcessorInitKwargs
-    valid_preprocess_kwargs = Gemma3FastImageProcessorPreprocessKwargs
+    valid_kwargs = Gemma3FastImageProcessorKwargs
 
-    def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorKwargs]):
         super().__init__(**kwargs)
 
     def _prepare_images_structure(
@@ -262,14 +253,12 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
     def preprocess(
         self,
         images: ImageInput,
-        **kwargs: Unpack[Gemma3FastImageProcessorPreprocessKwargs],
+        **kwargs: Unpack[Gemma3FastImageProcessorKwargs],
     ) -> BatchFeature:
-        validate_kwargs(
-            captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_preprocess_kwargs.__annotations__.keys()
-        )
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys())
         # Set default kwargs from self. This ensures that if a kwarg is not provided
         # by the user, it gets its default value from the instance, or is set to None.
-        for kwarg_name in self.valid_preprocess_kwargs.__annotations__:
+        for kwarg_name in self.valid_kwargs.__annotations__:
             kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
 
         # Extract parameters that are only used for preparing the input images
diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py
index 5103f73b11..8498e37803 100644
--- a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py
+++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py
@@ -21,8 +21,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -54,13 +53,7 @@ if is_torchvision_available():
         from torchvision.transforms import functional as F
 
 
-class GotOcr2ImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
-
-
-class GotOcr2ImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class GotOcr2ImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     crop_to_patches: Optional[bool]
     min_patches: Optional[int]
     max_patches: Optional[int]
@@ -93,10 +86,9 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
     crop_to_patches = False
     min_patches = 1
     max_patches = 12
-    valid_init_kwargs = GotOcr2ImageProcessorInitKwargs
-    valid_preprocess_kwargs = GotOcr2ImageProcessorPreprocessKwargs
+    valid_kwargs = GotOcr2ImageProcessorKwargs
 
-    def __init__(self, **kwargs: Unpack[GotOcr2ImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]):
         super().__init__(**kwargs)
 
     @add_start_docstrings(
@@ -113,7 +105,7 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
                 set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
         """,
     )
-    def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2ImageProcessorPreprocessKwargs]) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)
 
     def crop_image_to_patches(
diff --git a/src/transformers/models/llava/image_processing_llava_fast.py b/src/transformers/models/llava/image_processing_llava_fast.py
index e582336e97..d85eb89b7c 100644
--- a/src/transformers/models/llava/image_processing_llava_fast.py
+++ b/src/transformers/models/llava/image_processing_llava_fast.py
@@ -23,8 +23,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -61,11 +60,7 @@ if is_torchvision_available():
         from torchvision.transforms import functional as F
 
 
-class LlavaFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    do_pad: Optional[bool]
-
-
-class LlavaFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class LlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     do_pad: Optional[bool]
 
 
@@ -90,10 +85,9 @@ class LlavaImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
     do_convert_rgb = True
-    valid_init_kwargs = LlavaFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = LlavaFastImageProcessorPreprocessKwargs
+    valid_kwargs = LlavaFastImageProcessorKwargs
 
-    def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> None:
         super().__init__(**kwargs)
 
     @add_start_docstrings(
@@ -103,9 +97,7 @@ class LlavaImageProcessorFast(BaseImageProcessorFast):
                 Whether to pad the image to a square based on the longest edge. Can be overridden by the `do_pad` parameter
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)
 
     def pad_to_square(
diff --git a/src/transformers/models/llava_next/image_processing_llava_next_fast.py b/src/transformers/models/llava_next/image_processing_llava_next_fast.py
index 1323f303b0..d4caf2a19a 100644
--- a/src/transformers/models/llava_next/image_processing_llava_next_fast.py
+++ b/src/transformers/models/llava_next/image_processing_llava_next_fast.py
@@ -21,8 +21,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     divide_to_patches,
     group_images_by_shape,
     reorder_images,
@@ -57,12 +56,7 @@ if is_torchvision_available():
         from torchvision.transforms import functional as F
 
 
-class LlavaNextFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    image_grid_pinpoints: Optional[List[List[int]]]
-    do_pad: Optional[bool]
-
-
-class LlavaNextFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class LlavaNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     image_grid_pinpoints: Optional[List[List[int]]]
     do_pad: Optional[bool]
 
@@ -96,10 +90,9 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast):
     do_convert_rgb = True
     do_pad = True
     image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
-    valid_init_kwargs = LlavaNextFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = LlavaNextFastImageProcessorPreprocessKwargs
+    valid_kwargs = LlavaNextFastImageProcessorKwargs
 
-    def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]):
         super().__init__(**kwargs)
 
     @add_start_docstrings(
@@ -113,9 +106,7 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast):
                     number of patches in the batch. Padding will be applied to the bottom and right with zeros.
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)
 
     def _prepare_images_structure(
diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
index 13aa265496..598ac78f53 100644
--- a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
+++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
@@ -12,8 +12,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     divide_to_patches,
     group_images_by_shape,
     reorder_images,
@@ -40,12 +39,7 @@ else:
     from torchvision.transforms import functional as F
 
 
-class LlavaOnevisionFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    image_grid_pinpoints: Optional[List[List[int]]]
-    do_pad: Optional[bool]
-
-
-class LlavaOnevisionFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     image_grid_pinpoints: Optional[List[List[int]]]
     do_pad: Optional[bool]
 
@@ -77,11 +71,10 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
     do_convert_rgb = True
     do_pad = True
     image_grid_pinpoints = [[384, 384], [384, 768], [384, 1152], [384, 1536], [384, 1920], [384, 2304], [768, 384], [768, 768], [768, 1152], [768, 1536], [768, 1920], [768, 2304], [1152, 384], [1152, 768], [1152, 1152], [1152, 1536], [1152, 1920], [1152, 2304], [1536, 384], [1536, 768], [1536, 1152], [1536, 1536], [1536, 1920], [1536, 2304], [1920, 384], [1920, 768], [1920, 1152], [1920, 1536], [1920, 1920], [1920, 2304], [2304, 384], [2304, 768], [2304, 1152], [2304, 1536], [2304, 1920], [2304, 2304]]  # fmt: skip
-    valid_init_kwargs = LlavaOnevisionFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = LlavaOnevisionFastImageProcessorPreprocessKwargs
+    valid_kwargs = LlavaOnevisionFastImageProcessorKwargs
     model_input_names = ["pixel_values_videos"]
 
-    def __init__(self, **kwargs: Unpack[LlavaOnevisionFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]):
         super().__init__(**kwargs)
 
     @add_start_docstrings(
@@ -95,9 +88,7 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
                     number of patches in the batch. Padding will be applied to the bottom and right with zeros.
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)
 
     def _prepare_images_structure(
diff --git a/src/transformers/models/pixtral/image_processing_pixtral_fast.py b/src/transformers/models/pixtral/image_processing_pixtral_fast.py
index f76fe4a716..0cb4673038 100644
--- a/src/transformers/models/pixtral/image_processing_pixtral_fast.py
+++ b/src/transformers/models/pixtral/image_processing_pixtral_fast.py
@@ -21,8 +21,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -61,11 +60,7 @@ if is_torchvision_available():
         from torchvision.transforms import functional as F
 
 
-class PixtralFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    patch_size: Optional[Dict[str, int]]
-
-
-class PixtralFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class PixtralFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     patch_size: Optional[Dict[str, int]]
 
 
@@ -88,10 +83,9 @@ class PixtralImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
     do_convert_rgb = True
-    valid_init_kwargs = PixtralFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = PixtralFastImageProcessorPreprocessKwargs
+    valid_kwargs = PixtralFastImageProcessorKwargs
 
-    def __init__(self, **kwargs: Unpack[PixtralFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[PixtralFastImageProcessorKwargs]):
         super().__init__(**kwargs)
 
     @add_start_docstrings(
@@ -101,9 +95,7 @@ class PixtralImageProcessorFast(BaseImageProcessorFast):
             Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method.
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[PixtralFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[PixtralFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)
 
     def resize(
diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
index 8f4c233c0b..b54f86bba6 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
@@ -25,7 +25,7 @@ from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -69,7 +69,7 @@ elif is_torchvision_available():
 logger = logging.get_logger(__name__)
 
 
-class Qwen2VLFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
+class Qwen2VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     min_pixels: Optional[int]
     max_pixels: Optional[int]
     patch_size: Optional[int]
@@ -107,10 +107,10 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
     merge_size = 2
     min_pixels = 56 * 56
     max_pixels = 28 * 28 * 1280
-    valid_init_kwargs = Qwen2VLFastImageProcessorInitKwargs
+    valid_kwargs = Qwen2VLFastImageProcessorKwargs
     model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
 
-    def __init__(self, **kwargs: Unpack[Qwen2VLFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs]):
         super().__init__(**kwargs)
 
     def _preprocess(
diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py
index 0c9b4512ad..bd34843645 100644
--- a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py
+++ b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py
@@ -12,8 +12,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     SizeDict,
     add_start_docstrings,
     get_image_size_for_max_height_width,
@@ -53,21 +52,12 @@ elif is_torchvision_available():
     from torchvision.transforms import functional as F
 
 
-class RTDetrFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
+class RTDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     format: Optional[Union[str, AnnotationFormat]]
     do_convert_annotations: Optional[bool]
     do_pad: Optional[bool]
     pad_size: Optional[Dict[str, int]]
-
-
-class RTDetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
-    format: Optional[AnnotationFormat]
-    annotations: Optional[Dict]
-    do_convert_annotations: Optional[bool]
-    do_pad: Optional[bool]
-    pad_size: Optional[Dict[str, int]]
     return_segmentation_masks: Optional[bool]
-    masks_path: Optional[Union[str, pathlib.Path]]
 
 
 SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
@@ -151,6 +141,8 @@ def prepare_coco_detection_annotation(
             The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
             provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
             height and width in the batch.
+        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+            Whether to return segmentation masks.
     """,
 )
 class RTDetrImageProcessorFast(BaseImageProcessorFast):
@@ -165,11 +157,10 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
     size = {"height": 640, "width": 640}
     default_to_square = False
     model_input_names = ["pixel_values", "pixel_mask"]
-    valid_init_kwargs = RTDetrFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = RTDetrFastImageProcessorPreprocessKwargs
+    valid_kwargs = RTDetrFastImageProcessorKwargs
     do_convert_annotations = True
 
-    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None:
         # Backwards compatibility
         do_convert_annotations = kwargs.get("do_convert_annotations", None)
         do_normalize = kwargs.get("do_normalize", None)
@@ -424,9 +415,13 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
         """,
     )
     def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[RTDetrFastImageProcessorPreprocessKwargs]
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[RTDetrFastImageProcessorKwargs],
     ) -> BatchFeature:
-        return super().preprocess(images, **kwargs)
+        return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)
 
     def _preprocess(
         self,
diff --git a/src/transformers/models/rt_detr/modular_rt_detr.py b/src/transformers/models/rt_detr/modular_rt_detr.py
index 101d02c021..e1ee97b4da 100644
--- a/src/transformers/models/rt_detr/modular_rt_detr.py
+++ b/src/transformers/models/rt_detr/modular_rt_detr.py
@@ -2,8 +2,7 @@ import pathlib
 from typing import Dict, List, Optional, Tuple, Union
 
 from transformers.models.detr.image_processing_detr_fast import (
-    DetrFastImageProcessorInitKwargs,
-    DetrFastImageProcessorPreprocessKwargs,
+    DetrFastImageProcessorKwargs,
     DetrImageProcessorFast,
 )
 
@@ -112,11 +111,7 @@ def prepare_coco_detection_annotation(
     return new_target
 
 
-class RTDetrFastImageProcessorInitKwargs(DetrFastImageProcessorInitKwargs):
-    pass
-
-
-class RTDetrFastImageProcessorPreprocessKwargs(DetrFastImageProcessorPreprocessKwargs):
+class RTDetrFastImageProcessorKwargs(DetrFastImageProcessorKwargs):
     pass
 
 
@@ -133,10 +128,9 @@ class RTDetrImageProcessorFast(DetrImageProcessorFast, BaseImageProcessorFast):
     size = {"height": 640, "width": 640}
     default_to_square = False
     model_input_names = ["pixel_values", "pixel_mask"]
-    valid_init_kwargs = RTDetrFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = RTDetrFastImageProcessorPreprocessKwargs
+    valid_kwargs = RTDetrFastImageProcessorKwargs
 
-    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None:
         # Backwards compatibility
         do_convert_annotations = kwargs.get("do_convert_annotations", None)
         do_normalize = kwargs.get("do_normalize", None)
@@ -181,9 +175,13 @@ class RTDetrImageProcessorFast(DetrImageProcessorFast, BaseImageProcessorFast):
         """,
     )
     def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[RTDetrFastImageProcessorPreprocessKwargs]
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[RTDetrFastImageProcessorKwargs],
     ) -> BatchFeature:
-        return BaseImageProcessorFast().preprocess(images, **kwargs)
+        return BaseImageProcessorFast().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)
 
     def prepare_annotation(
         self,
diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py
index 0df30adada..f42cd6847b 100644
--- a/tests/test_image_processing_common.py
+++ b/tests/test_image_processing_common.py
@@ -311,8 +311,10 @@ class ImageProcessingTestMixin:
         }
         dict_slow_0 = {key: dict_slow_0[key] for key in set(dict_slow_0) & set(dict_slow_1)}
         dict_slow_1 = {key: dict_slow_1[key] for key in set(dict_slow_0) & set(dict_slow_1)}
-        # check that all additional keys are None, except for `default_to_square` which is only set in fast processors
-        self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"]))
+        # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors
+        self.assertTrue(
+            all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"])
+        )
         # check that the remaining keys are the same
         self.assertEqual(dict_slow_0, dict_slow_1)
 
@@ -324,8 +326,10 @@ class ImageProcessingTestMixin:
         }
         dict_fast_0 = {key: dict_fast_0[key] for key in set(dict_fast_0) & set(dict_fast_1)}
         dict_fast_1 = {key: dict_fast_1[key] for key in set(dict_fast_0) & set(dict_fast_1)}
-        # check that all additional keys are None, except for `default_to_square` which is only set in fast processors
-        self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"]))
+        # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors
+        self.assertTrue(
+            all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"])
+        )
         # check that the remaining keys are the same
         self.assertEqual(dict_fast_0, dict_fast_1)
 
@@ -357,8 +361,10 @@ class ImageProcessingTestMixin:
         }
         dict_slow_0 = {key: dict_slow_0[key] for key in set(dict_slow_0) & set(dict_slow_1)}
         dict_slow_1 = {key: dict_slow_1[key] for key in set(dict_slow_0) & set(dict_slow_1)}
-        # check that all additional keys are None, except for `default_to_square` which is only set in fast processors
-        self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"]))
+        # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors
+        self.assertTrue(
+            all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"])
+        )
         # check that the remaining keys are the same
         self.assertEqual(dict_slow_0, dict_slow_1)
 
@@ -370,8 +376,10 @@ class ImageProcessingTestMixin:
         }
         dict_fast_0 = {key: dict_fast_0[key] for key in set(dict_fast_0) & set(dict_fast_1)}
         dict_fast_1 = {key: dict_fast_1[key] for key in set(dict_fast_0) & set(dict_fast_1)}
-        # check that all additional keys are None, except for `default_to_square` which is only set in fast processors
-        self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"]))
+        # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors
+        self.assertTrue(
+            all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"])
+        )
         # check that the remaining keys are the same
         self.assertEqual(dict_fast_0, dict_fast_1)
 
diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py
index 728c5628b1..efb8ac0047 100644
--- a/utils/modular_model_converter.py
+++ b/utils/modular_model_converter.py
@@ -1087,8 +1087,7 @@ TYPE_TO_FILE_TYPE = {
     "Processor": "processing",
     "ImageProcessor": "image_processing",
     "ImageProcessorFast": "image_processing*_fast",  # "*" indicates where to insert the model name before the "_fast" suffix
-    "FastImageProcessorInitKwargs": "image_processing*_fast",
-    "FastImageProcessorPreprocessKwargs": "image_processing*_fast",
+    "FastImageProcessorKwargs": "image_processing*_fast",
     "FeatureExtractor": "feature_extractor",
     "ProcessorKwargs": "processing",
     "ImagesKwargs": "processing",