🚨[Fast Image Processor] Force Fast Image Processor for Qwen2_VL/2_5_VL + Refactor (#39591)

* init * Force qwen2VL image proc to fast * refactor qwen2 vl fast * fix copies * Update after PR review and update tests to use return_tensors="pt" * fix processor tests * add BC for min pixels/max pixels
2025-07-25 11:11:28 -04:00
parent f90de364c2
commit 17f02102c5
11 changed files with 222 additions and 283 deletions
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -49,6 +49,9 @@ from .configuration_auto import (
 logger = logging.get_logger(__name__)


+FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"]
+
+
 if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
@@ -514,6 +517,13 @@ class AutoImageProcessor:
            # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
            if use_fast is None:
                use_fast = image_processor_type.endswith("Fast")
+                if not use_fast and image_processor_type in FORCE_FAST_IMAGE_PROCESSOR and is_torchvision_available():
+                    use_fast = True
+                    logger.warning_once(
+                        f"The image processor of type `{image_processor_type}` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. "
+                        "This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. "
+                        "Note that this behavior will be extended to all models in a future release."
+                    )
                if not use_fast:
                    logger.warning_once(
                        "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
--- a/src/transformers/models/colqwen2/modular_colqwen2.py
+++ b/src/transformers/models/colqwen2/modular_colqwen2.py
@@ -67,7 +67,7 @@ class ColQwen2Processor(ColPaliProcessor):
        query_prefix (`str`, *optional*): A prefix to be used for the query.
    """

-    image_processor_class = "Qwen2VLImageProcessor"
+    image_processor_class = "AutoImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(
--- a/src/transformers/models/colqwen2/processing_colqwen2.py
+++ b/src/transformers/models/colqwen2/processing_colqwen2.py
@@ -66,7 +66,7 @@ class ColQwen2Processor(ProcessorMixin):

    attributes = ["image_processor", "tokenizer"]

-    image_processor_class = "Qwen2VLImageProcessor"
+    image_processor_class = "AutoImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(
--- a/src/transformers/models/glm4v/image_processing_glm4v_fast.py
+++ b/src/transformers/models/glm4v/image_processing_glm4v_fast.py
@@ -138,6 +138,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
        processed_images_grouped = {}
        processed_grids = {}
        for shape, stacked_images in grouped_images.items():
+            resized_height, resized_width = stacked_images.shape[-2:]
            # Fused rescale and normalize
            stacked_images = self.rescale_and_normalize(
                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
@@ -188,9 +189,6 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
        images: ImageInput,
        **kwargs: Unpack[Glm4vFastImageProcessorKwargs],
    ) -> BatchFeature:
-        """
-        Preprocess an image or batch of images.
-        """
        return super().preprocess(images, **kwargs)


--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
@@ -35,9 +35,6 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    SizeDict,
-    get_image_size,
-    make_flat_list_of_images,
-    valid_images,
 )
 from ...processing_utils import Unpack
 from ...utils import (
@@ -57,8 +54,6 @@ if is_torch_available():


 if is_torchvision_available():
-    from ...image_utils import pil_torch_interpolation_mapping
-
    if is_torchvision_v2_available():
        from torchvision.transforms.v2 import functional as F
    else:
@@ -110,18 +105,90 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
        size = kwargs.pop("size", None)
        min_pixels = kwargs.pop("min_pixels", None)
        max_pixels = kwargs.pop("max_pixels", None)
-        if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
-            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
-        else:
-            size = self.size
        # backward compatibility: override size with min_pixels and max_pixels if they are provided
+        size = self.size if size is None else size
        if min_pixels is not None:
            size["shortest_edge"] = min_pixels
+            size.pop("min_pixels", None)
        if max_pixels is not None:
            size["longest_edge"] = max_pixels
+            size.pop("max_pixels", None)
+        if "shortest_edge" not in size or "longest_edge" not in size:
+            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")

        super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)

+    def _further_process_kwargs(
+        self,
+        size: Optional[SizeDict] = None,
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Update kwargs that need further processing before being validated
+        Can be overridden by subclasses to customize the processing of kwargs.
+        """
+        if min_pixels is not None and max_pixels is not None:
+            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
+        elif size is not None:
+            if "shortest_edge" not in size or "longest_edge" not in size:
+                raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+            min_pixels = size["shortest_edge"]
+            max_pixels = size["longest_edge"]
+        else:
+            size = {**self.size}
+
+        return super()._further_process_kwargs(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
+
+    @auto_docstring
+    def preprocess(
+        self,
+        images: ImageInput,
+        videos: Optional[VideoInput] = None,
+        **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        return super().preprocess(images, videos, **kwargs)
+
+    def _preprocess_image_like_inputs(
+        self,
+        images: ImageInput,
+        videos: VideoInput,
+        do_convert_rgb: bool,
+        input_data_format: ChannelDimension,
+        device: Optional[Union[str, "torch.device"]] = None,
+        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Preprocess image-like inputs.
+        To be overriden by subclasses when image-like inputs other than images should be processed.
+        It can be used for segmentation maps, depth maps, etc.
+        """
+        # Prepare input images
+        batch_feature = BatchFeature()
+        if images is not None:
+            images = self._prepare_image_like_inputs(
+                images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
+            )
+            batch_feature = self._preprocess(images, **kwargs)
+        if videos is not None:
+            logger.warning(
+                "`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. "
+                "This is a deprecated behavior and will be removed in v5.0. "
+                "Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
+            )
+            # Can't change _prepare_images_structure to work with videos because it also needs to work with images.
+            videos = make_batched_videos(videos)
+            videos = [
+                torch.stack(self._prepare_image_like_inputs(video, do_convert_rgb, input_data_format, device))
+                for video in videos
+            ]
+            video_outputs = self._preprocess(videos, **kwargs)
+            batch_feature.update(
+                {"pixel_values_videos": video_outputs.pixel_values, "video_grid_thw": video_outputs.image_grid_thw}
+            )
+        return batch_feature
+
    def _preprocess(
        self,
        images: list["torch.Tensor"],
@@ -136,65 +203,15 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
        patch_size: int,
        temporal_patch_size: int,
        merge_size: int,
-        do_convert_rgb: bool,
-        input_data_format: Optional[Union[str, ChannelDimension]],
-        device: Optional[Union[str, torch.device]],
        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
    ):
-        """
-        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
-
-        Args:
-            images (`ImageInput`):
-                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
-            vision_info (`list[Dict]`, *optional*):
-                Optional list of dictionaries containing additional information about vision inputs.
-            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
-                Whether to resize the image.
-            size (`dict[str, int]`, *optional*, defaults to `self.size`):
-                Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
-            interpolation (`InterpolationMode`):
-                Resampling filter to use if resizing the image.
-            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
-                Whether to rescale the image.
-            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
-                Scale factor to use if rescaling the image.
-            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
-                Whether to normalize the image.
-            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
-                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
-            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
-                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
-            patch_size (`int`, *optional*, defaults to `self.patch_size`):
-                The spatial patch size of the vision encoder.
-            temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
-                The temporal patch size of the vision encoder.
-            merge_size (`int`, *optional*, defaults to `self.merge_size`):
-                The merge size of the vision encoder to llm encoder.
-            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
-                Whether to convert the image to RGB.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format for the input image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.   - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-            device (`torch.device`, *optional*):
-                The device to process the images on. If unset, the device is inferred from the input images.
-        """
-        images = self._prepare_image_like_inputs(
-            images=images,
-            do_convert_rgb=do_convert_rgb,
-            input_data_format=input_data_format,
-            device=device,
-        )
-
-        height, width = get_image_size(images[0], channel_dim=ChannelDimension.FIRST)
-        resized_height, resized_width = height, width
-
        # Group images by size for batched resizing
        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        resized_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
+            height, width = stacked_images.shape[-2:]
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
@@ -215,203 +232,63 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
        # Needed in case do_resize is False, or resize returns images with different sizes
        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
        processed_images_grouped = {}
+        processed_grids = {}
        for shape, stacked_images in grouped_images.items():
+            resized_height, resized_width = stacked_images.shape[-2:]
            # Fused rescale and normalize
-            stacked_images = self.rescale_and_normalize(
+            patches = self.rescale_and_normalize(
                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
            )
-            processed_images_grouped[shape] = stacked_images
+            if patches.ndim == 4:
+                # add a temporal dimension if we have images
+                patches = patches.unsqueeze(1)
+            if patches.shape[1] % temporal_patch_size != 0:
+                repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
+                patches = torch.cat([patches, repeats], dim=1)
+            batch_size, grid_t, channel = patches.shape[:3]
+            grid_t = grid_t // temporal_patch_size
+            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
+
+            patches = patches.view(
+                batch_size,
+                grid_t,
+                temporal_patch_size,
+                channel,
+                grid_h // merge_size,
+                merge_size,
+                patch_size,
+                grid_w // merge_size,
+                merge_size,
+                patch_size,
+            )
+            # Reorder dimensions to group grid and patch information for subsequent flattening.
+            # (batch, grid_t, grid_h, grid_w, merge_h, merge_w, channel, temp_patch_size, patch_h, patch_w)
+            patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
+            flatten_patches = patches.reshape(
+                batch_size,
+                grid_t * grid_h * grid_w,
+                channel * temporal_patch_size * patch_size * patch_size,
+            )
+
+            processed_images_grouped[shape] = flatten_patches
+            processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size

        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        patches = torch.stack(processed_images, dim=0)
-        if patches.shape[0] % temporal_patch_size != 0:
-            repeats = patches[-1].unsqueeze(0).repeat(temporal_patch_size - 1, 1, 1, 1)
-            patches = torch.cat([patches, repeats], dim=0)
+        processed_grids = reorder_images(processed_grids, grouped_images_index)
+        pixel_values = torch.cat(processed_images, dim=0)
+        image_grid_thw = torch.tensor(processed_grids)

-        channel = patches.shape[1]
-        grid_t = patches.shape[0] // temporal_patch_size
-        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
-
-        patches = patches.view(
-            grid_t,
-            temporal_patch_size,
-            channel,
-            grid_h // merge_size,
-            merge_size,
-            patch_size,
-            grid_w // merge_size,
-            merge_size,
-            patch_size,
+        return BatchFeature(
+            data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors
        )
-        patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
-        flatten_patches = patches.reshape(
-            grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size
-        )
-
-        return flatten_patches, (grid_t, grid_h, grid_w)
-
-    @auto_docstring
-    def preprocess(
-        self,
-        images: ImageInput,
-        videos: VideoInput = None,
-        do_resize: Optional[bool] = None,
-        size: Optional[dict[str, int]] = None,
-        resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] = None,
-        do_rescale: Optional[bool] = None,
-        rescale_factor: Optional[float] = None,
-        do_normalize: Optional[bool] = None,
-        image_mean: Optional[Union[float, list[float]]] = None,
-        image_std: Optional[Union[float, list[float]]] = None,
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        patch_size: Optional[int] = None,
-        temporal_patch_size: Optional[int] = None,
-        merge_size: Optional[int] = None,
-        do_convert_rgb: Optional[bool] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-        device: Optional["torch.device"] = None,
-        disable_grouping: Optional[bool] = False,
-        **kwargs,
-    ):
-        r"""
-        min_pixels (`int`, *optional*, defaults to `56 * 56`):
-            The min pixels of the image to resize the image.
-        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
-            The max pixels of the image to resize the image.
-        patch_size (`int`, *optional*, defaults to 14):
-            The spatial patch size of the vision encoder.
-        temporal_patch_size (`int`, *optional*, defaults to 2):
-            The temporal patch size of the vision encoder.
-        merge_size (`int`, *optional*, defaults to 2):
-            The merge size of the vision encoder to llm encoder.
-        """
-        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
-        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
-
-        if size is not None:
-            if "shortest_edge" not in size or "longest_edge" not in size:
-                raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
-            min_pixels = size["shortest_edge"]
-        elif min_pixels is not None and max_pixels is not None:
-            # backward compatibility: override size with min_pixels and max_pixels if they are provided
-            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
-        else:
-            size = {**self.size}
-
-        do_resize = do_resize if do_resize is not None else self.do_resize
-        size = size if size is not None else self.size
-        resample = resample if resample is not None else self.resample
-        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
-        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
-        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
-        image_mean = image_mean if image_mean is not None else self.image_mean
-        image_std = image_std if image_std is not None else self.image_std
-        patch_size = patch_size if patch_size is not None else self.patch_size
-        temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size
-        merge_size = merge_size if merge_size is not None else self.merge_size
-        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
-
-        # Make hashable for cache
-        size = SizeDict(**size) if size is not None else None
-        image_mean = tuple(image_mean) if image_mean is not None else None
-        image_std = tuple(image_std) if image_std is not None else None
-
-        self._validate_preprocess_kwargs(
-            do_rescale=do_rescale,
-            rescale_factor=rescale_factor,
-            do_normalize=do_normalize,
-            image_mean=image_mean,
-            image_std=image_std,
-            do_resize=do_resize,
-            size=size,
-            resample=resample,
-            return_tensors=return_tensors,
-            data_format=data_format,
-        )
-        interpolation = (
-            pil_torch_interpolation_mapping[resample] if isinstance(resample, (PILImageResampling, int)) else resample
-        )
-
-        if images is not None:
-            images = make_flat_list_of_images(images)
-
-        if images is not None and not valid_images(images):
-            raise ValueError(
-                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
-                "torch.Tensor, tf.Tensor or jax.ndarray."
-            )
-
-        data = {}
-        if images is not None:
-            pixel_values, vision_grid_thws = [], []
-            for image in images:
-                patches, image_grid_thw = self._preprocess(
-                    image,
-                    do_resize=do_resize,
-                    size=size,
-                    interpolation=interpolation,
-                    do_rescale=do_rescale,
-                    rescale_factor=rescale_factor,
-                    do_normalize=do_normalize,
-                    image_mean=image_mean,
-                    image_std=image_std,
-                    patch_size=patch_size,
-                    temporal_patch_size=temporal_patch_size,
-                    merge_size=merge_size,
-                    do_convert_rgb=do_convert_rgb,
-                    input_data_format=input_data_format,
-                    device=device,
-                    disable_grouping=disable_grouping,
-                )
-                pixel_values.extend(patches)
-                vision_grid_thws.append(image_grid_thw)
-            pixel_values = torch.stack(pixel_values)
-            vision_grid_thws = torch.tensor(vision_grid_thws)
-            data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws})
-
-        # kept for BC only and should be removed after v5.0
-        if videos is not None:
-            logger.warning(
-                "`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. "
-                "This is a deprecated behavior and will be removed in v5.0. "
-                "Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
-            )
-            videos = make_batched_videos(videos)
-            pixel_values_videos, vision_grid_thws_videos = [], []
-            for images in videos:
-                patches, video_grid_thw = self._preprocess(
-                    images,
-                    do_resize=do_resize,
-                    size=size,
-                    interpolation=interpolation,
-                    do_rescale=do_rescale,
-                    rescale_factor=rescale_factor,
-                    do_normalize=do_normalize,
-                    image_mean=image_mean,
-                    image_std=image_std,
-                    patch_size=patch_size,
-                    temporal_patch_size=temporal_patch_size,
-                    merge_size=merge_size,
-                    do_convert_rgb=do_convert_rgb,
-                    input_data_format=input_data_format,
-                    device=device,
-                    disable_grouping=disable_grouping,
-                )
-                pixel_values_videos.extend(patches)
-                vision_grid_thws_videos.append(video_grid_thw)
-            pixel_values_videos = torch.stack(pixel_values_videos)
-            vision_grid_thws_videos = torch.tensor(vision_grid_thws_videos)
-            data.update({"pixel_values_videos": pixel_values_videos, "video_grid_thw": vision_grid_thws_videos})
-
-        return BatchFeature(data=data, tensor_type=return_tensors)

    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
        """
        A utility that returns number of image patches for a given image size.

+        Note: Do not remove this method! It is used by vLLM to infer the number of patches and placeholders
+        without an image input.
+
        Args:
            height (`int`):
                Height of the input image.
--- a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
@@ -116,8 +116,21 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
    model_input_names = ["pixel_values_videos", "video_grid_thw"]

    def __init__(self, **kwargs: Unpack[Qwen2VLVideoProcessorInitKwargs]):
-        super().__init__(**kwargs)
-        self.size = {"shortest_edge": self.min_pixels, "longest_edge": self.max_pixels}
+        size = kwargs.pop("size", None)
+        min_pixels = kwargs.pop("min_pixels", None)
+        max_pixels = kwargs.pop("max_pixels", None)
+        # backward compatibility: override size with min_pixels and max_pixels if they are provided
+        size = self.size if size is None else size
+        if min_pixels is not None:
+            size["shortest_edge"] = min_pixels
+            size.pop("min_pixels", None)
+        if max_pixels is not None:
+            size["longest_edge"] = max_pixels
+            size.pop("max_pixels", None)
+        if "shortest_edge" not in size or "longest_edge" not in size:
+            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+
+        super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)

    def sample_frames(
        self,
--- a/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py
+++ b/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py
@@ -25,10 +25,17 @@ from huggingface_hub import hf_hub_download
 from transformers import (
    AutoProcessor,
    Qwen2_5OmniProcessor,
-    Qwen2Tokenizer,
+    Qwen2TokenizerFast,
    WhisperFeatureExtractor,
 )
-from transformers.testing_utils import require_av, require_librosa, require_torch, require_torchaudio, require_vision
+from transformers.testing_utils import (
+    require_av,
+    require_librosa,
+    require_torch,
+    require_torchaudio,
+    require_torchvision,
+    require_vision,
+)
 from transformers.utils import is_torch_available, is_vision_available

 from ...test_processing_common import ProcessorTesterMixin
@@ -38,12 +45,13 @@ if is_torch_available():
    import torch

 if is_vision_available():
-    from transformers import Qwen2VLImageProcessor
+    from transformers import Qwen2VLImageProcessorFast


@require_vision
@require_torch
@require_torchaudio
+@require_torchvision
 class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = Qwen2_5OmniProcessor

@@ -244,13 +252,13 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        )

        processor.save_pretrained(self.tmpdirname)
-        processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=False)
+        processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=True)

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
-        self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
-        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
+        self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
+        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)
        self.assertIsInstance(processor.feature_extractor, WhisperFeatureExtractor)

    def test_image_processor(self):
@@ -267,8 +275,8 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        image_input = self.prepare_image_inputs()

-        input_image_proc = image_processor(image_input, return_tensors="np")
-        input_processor = processor(images=image_input, text="dummy", return_tensors="np")
+        input_image_proc = image_processor(image_input, return_tensors="pt")
+        input_processor = processor(images=image_input, text="dummy", return_tensors="pt")

        for key in input_image_proc.keys():
            self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
--- a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
@@ -20,15 +20,15 @@ import unittest
 import numpy as np
 import pytest

-from transformers import AutoProcessor, Qwen2Tokenizer
-from transformers.testing_utils import require_av, require_torch, require_vision
+from transformers import AutoProcessor, Qwen2TokenizerFast
+from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision
 from transformers.utils import is_torch_available, is_vision_available

 from ...test_processing_common import ProcessorTesterMixin


 if is_vision_available():
-    from transformers import Qwen2_5_VLProcessor, Qwen2VLImageProcessor
+    from transformers import Qwen2_5_VLProcessor, Qwen2VLImageProcessorFast

 if is_torch_available():
    import torch
@@ -36,6 +36,7 @@ if is_torch_available():

@require_vision
@require_torch
+@require_torchvision
 class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = Qwen2_5_VLProcessor

@@ -73,12 +74,12 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
        )
        processor.save_pretrained(self.tmpdirname)
-        processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)
+        processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=True)

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
-        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
+        self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
+        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)

    def test_image_processor(self):
        image_processor = self.get_image_processor()
@@ -91,8 +92,8 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        image_input = self.prepare_image_inputs()

-        input_image_proc = image_processor(image_input, return_tensors="np")
-        input_processor = processor(images=image_input, text="dummy", return_tensors="np")
+        input_image_proc = image_processor(image_input, return_tensors="pt")
+        input_processor = processor(images=image_input, text="dummy", return_tensors="pt")

        for key in input_image_proc.keys():
            self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
--- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -22,7 +22,7 @@ import requests
 from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_vision_available
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available

 from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs

@@ -35,8 +35,8 @@ if is_vision_available():

    from transformers import Qwen2VLImageProcessor

-    # if is_torchvision_available():
-    #     from transformers import Qwen2VLImageProcessorFast
+    if is_torchvision_available():
+        from transformers import Qwen2VLImageProcessorFast


 class Qwen2VLImageProcessingTester:
@@ -119,7 +119,7 @@ class Qwen2VLImageProcessingTester:
@require_vision
 class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = Qwen2VLImageProcessor if is_vision_available() else None
-    # fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None
+    fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None

    def setUp(self):
        super().setUp()
@@ -363,3 +363,34 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")

        self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
+        self.assertEqual(encoding_slow.image_grid_thw.dtype, encoding_fast.image_grid_thw.dtype)
+        self._assert_slow_fast_tensors_equivalence(
+            encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float()
+        )
+
+    @require_vision
+    @require_torch
+    def test_slow_fast_equivalence_batched(self):
+        if not self.test_slow_image_processor or not self.test_fast_image_processor:
+            self.skipTest(reason="Skipping slow/fast equivalence test")
+
+        if self.image_processing_class is None or self.fast_image_processing_class is None:
+            self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
+
+        if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
+            self.skipTest(
+                reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
+            )
+
+        dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
+        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
+
+        encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
+        encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
+
+        self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
+        self.assertEqual(encoding_slow.image_grid_thw.dtype, encoding_fast.image_grid_thw.dtype)
+        self._assert_slow_fast_tensors_equivalence(
+            encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float()
+        )
--- a/tests/models/qwen2_vl/test_processor_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.py
@@ -20,18 +20,18 @@ import unittest
 import numpy as np
 import pytest

-from transformers import AutoProcessor, Qwen2Tokenizer
-from transformers.testing_utils import require_av, require_torch, require_vision
+from transformers import AutoProcessor, Qwen2TokenizerFast
+from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision
 from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available

 from ...test_processing_common import ProcessorTesterMixin


 if is_vision_available():
-    from transformers import Qwen2VLImageProcessor, Qwen2VLProcessor
+    from transformers import Qwen2VLProcessor

    if is_torchvision_available():
-        from transformers import Qwen2VLVideoProcessor
+        from transformers import Qwen2VLImageProcessorFast, Qwen2VLVideoProcessor

 if is_torch_available():
    import torch
@@ -39,6 +39,7 @@ if is_torch_available():

@require_vision
@require_torch
+@require_torchvision
 class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = Qwen2VLProcessor

@@ -76,12 +77,12 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
        )
        processor.save_pretrained(self.tmpdirname)
-        processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)
+        processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=True)

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
-        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
+        self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
+        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)
        self.assertIsInstance(processor.video_processor, Qwen2VLVideoProcessor)

    def test_image_processor(self):
@@ -95,8 +96,8 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        image_input = self.prepare_image_inputs()

-        input_image_proc = image_processor(image_input, return_tensors="np")
-        input_processor = processor(images=image_input, text="dummy", return_tensors="np")
+        input_image_proc = image_processor(image_input, return_tensors="pt")
+        input_processor = processor(images=image_input, text="dummy", return_tensors="pt")

        for key in input_image_proc.keys():
            self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -937,7 +937,7 @@ class ProcessorTesterMixin:
            "video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"]
        )

-    @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
+    @parameterized.expand([(1, "pt"), (2, "pt")])  # fast image processors supports only torchvision
    def test_apply_chat_template_image(self, batch_size: int, return_tensors: str):
        self._test_apply_chat_template(
            "image", batch_size, return_tensors, "images_input_name", "image_processor", MODALITY_INPUT_DATA["images"]