🚨[Fast Image Processor] Force Fast Image Processor for Qwen2_VL/2_5_VL + Refactor (#39591)

* init * Force qwen2VL image proc to fast * refactor qwen2 vl fast * fix copies * Update after PR review and update tests to use return_tensors="pt" * fix processor tests * add BC for min pixels/max pixels
2025-07-25 11:11:28 -04:00
parent f90de364c2
commit 17f02102c5
11 changed files with 222 additions and 283 deletions
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -49,6 +49,9 @@ from .configuration_auto import (
 logger = logging.get_logger(__name__)
 FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"]
 if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
@@ -514,6 +517,13 @@ class AutoImageProcessor:
            # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
            if use_fast is None:
                use_fast = image_processor_type.endswith("Fast")
                if not use_fast and image_processor_type in FORCE_FAST_IMAGE_PROCESSOR and is_torchvision_available():
                    use_fast = True
                    logger.warning_once(
                        f"The image processor of type `{image_processor_type}` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. "
                        "This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. "
                        "Note that this behavior will be extended to all models in a future release."
                    )
                if not use_fast:
                    logger.warning_once(
                        "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
--- a/src/transformers/models/colqwen2/modular_colqwen2.py
+++ b/src/transformers/models/colqwen2/modular_colqwen2.py
@@ -67,7 +67,7 @@ class ColQwen2Processor(ColPaliProcessor):
        query_prefix (`str`, *optional*): A prefix to be used for the query.
    """
-    image_processor_class = "Qwen2VLImageProcessor"
+    image_processor_class = "AutoImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
    def __init__(
--- a/src/transformers/models/colqwen2/processing_colqwen2.py
+++ b/src/transformers/models/colqwen2/processing_colqwen2.py
@@ -66,7 +66,7 @@ class ColQwen2Processor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "Qwen2VLImageProcessor"
+    image_processor_class = "AutoImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
    def __init__(
--- a/src/transformers/models/glm4v/image_processing_glm4v_fast.py
+++ b/src/transformers/models/glm4v/image_processing_glm4v_fast.py
@@ -138,6 +138,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
        processed_images_grouped = {}
        processed_grids = {}
        for shape, stacked_images in grouped_images.items():
            resized_height, resized_width = stacked_images.shape[-2:]
            # Fused rescale and normalize
            stacked_images = self.rescale_and_normalize(
                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
@@ -188,9 +189,6 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast):
        images: ImageInput,
        **kwargs: Unpack[Glm4vFastImageProcessorKwargs],
    ) -> BatchFeature:
        """
        Preprocess an image or batch of images.
        """
        return super().preprocess(images, **kwargs)
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
@@ -35,9 +35,6 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    SizeDict,
    get_image_size,
    make_flat_list_of_images,
    valid_images,
 )
 from ...processing_utils import Unpack
 from ...utils import (
@@ -57,8 +54,6 @@ if is_torch_available():
 if is_torchvision_available():
    from ...image_utils import pil_torch_interpolation_mapping
    if is_torchvision_v2_available():
        from torchvision.transforms.v2 import functional as F
    else:
@@ -110,18 +105,90 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
        size = kwargs.pop("size", None)
        min_pixels = kwargs.pop("min_pixels", None)
        max_pixels = kwargs.pop("max_pixels", None)
        if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
        else:
            size = self.size
        # backward compatibility: override size with min_pixels and max_pixels if they are provided
        size = self.size if size is None else size
        if min_pixels is not None:
            size["shortest_edge"] = min_pixels
            size.pop("min_pixels", None)
        if max_pixels is not None:
            size["longest_edge"] = max_pixels
            size.pop("max_pixels", None)
        if "shortest_edge" not in size or "longest_edge" not in size:
            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
        super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
    def _further_process_kwargs(
        self,
        size: Optional[SizeDict] = None,
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        **kwargs,
    ) -> dict:
        """
        Reconcile `size` with the legacy `min_pixels` / `max_pixels` arguments, then defer to the parent hook.

        Resolution order:
            1. If both `min_pixels` and `max_pixels` are given, `size` is rebuilt from them
               (`shortest_edge` <- `min_pixels`, `longest_edge` <- `max_pixels`), replacing any `size` passed in.
            2. Otherwise, if `size` is given, it must carry both edge keys; `min_pixels` and `max_pixels`
               are then taken from it.
            3. Otherwise a shallow copy of `self.size` is used and `min_pixels` / `max_pixels` are forwarded
               exactly as received.

        Args:
            size: Mapping expected to contain the `shortest_edge` and `longest_edge` keys.
            min_pixels: Legacy lower bound, mapped onto `shortest_edge`.
            max_pixels: Legacy upper bound, mapped onto `longest_edge`.
            **kwargs: Remaining options, forwarded untouched to the parent implementation.

        Returns:
            Whatever the parent `_further_process_kwargs` returns for the reconciled arguments.

        Raises:
            ValueError: If `size` is the value in effect and lacks `shortest_edge` or `longest_edge`.
        """
        both_pixel_bounds_given = min_pixels is not None and max_pixels is not None

        if both_pixel_bounds_given:
            # The legacy pixel bounds win over any `size` that was passed alongside them.
            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
        elif size is None:
            # Copy so that later changes to the returned mapping do not touch `self.size`.
            size = {**self.size}
        else:
            if not ("shortest_edge" in size and "longest_edge" in size):
                raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
            min_pixels, max_pixels = size["shortest_edge"], size["longest_edge"]

        return super()._further_process_kwargs(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
    @auto_docstring
    def preprocess(
        self,
        images: ImageInput,
        videos: Optional[VideoInput] = None,
        **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs],
    ) -> BatchFeature:
        # Pass-through override: `images`, `videos` and every keyword option are forwarded unchanged
        # to the parent class's `preprocess`; no processing happens in this method itself.
        # Presumably it exists so that `videos` and the Qwen2-VL kwargs type appear in the public signature.
        # NOTE(review): no docstring is written here on the assumption that `@auto_docstring` supplies one - confirm.
        return super().preprocess(images, videos, **kwargs)
    def _preprocess_image_like_inputs(
        self,
        images: ImageInput,
        videos: VideoInput,
        do_convert_rgb: bool,
        input_data_format: ChannelDimension,
        device: Optional[Union[str, "torch.device"]] = None,
        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
    ) -> BatchFeature:
        """
        Run the preprocessing pipeline on images and, for backward compatibility, on videos.

        Images (when not `None`) are prepared with `_prepare_image_like_inputs` and passed to `_preprocess`;
        that result becomes the returned `BatchFeature`. Videos (when not `None`) trigger a deprecation
        warning, are batched with `make_batched_videos`, prepared video by video, stacked into one tensor
        per video and passed through the same `_preprocess`. Their outputs are stored under
        `pixel_values_videos` and `video_grid_thw`.

        Args:
            images: Image inputs, or `None` to skip image processing.
            videos: Video inputs, or `None` to skip the deprecated video path.
            do_convert_rgb: Forwarded to `_prepare_image_like_inputs`.
            input_data_format: Forwarded to `_prepare_image_like_inputs`.
            device: Forwarded to `_prepare_image_like_inputs`.
            **kwargs: Remaining options, forwarded to `_preprocess`.

        Returns:
            A `BatchFeature`; it is empty when both `images` and `videos` are `None`.
        """
        outputs = BatchFeature()

        if images is not None:
            prepared_images = self._prepare_image_like_inputs(
                images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
            )
            outputs = self._preprocess(prepared_images, **kwargs)

        if videos is None:
            return outputs

        logger.warning(
            "`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. "
            "This is a deprecated behavior and will be removed in v5.0. "
            "Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
        )
        # Videos are batched here rather than in `_prepare_images_structure`, because that hook
        # must keep working for plain images as well.
        stacked_videos = []
        for frames in make_batched_videos(videos):
            prepared_frames = self._prepare_image_like_inputs(frames, do_convert_rgb, input_data_format, device)
            stacked_videos.append(torch.stack(prepared_frames))

        video_features = self._preprocess(stacked_videos, **kwargs)
        outputs.update(
            {"pixel_values_videos": video_features.pixel_values, "video_grid_thw": video_features.image_grid_thw}
        )
        return outputs
    def _preprocess(
        self,
        images: list["torch.Tensor"],
@@ -136,65 +203,15 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
        patch_size: int,
        temporal_patch_size: int,
        merge_size: int,
        do_convert_rgb: bool,
        input_data_format: Optional[Union[str, ChannelDimension]],
        device: Optional[Union[str, torch.device]],
        disable_grouping: Optional[bool],
        return_tensors: Optional[Union[str, TensorType]],
        **kwargs,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
            vision_info (`list[Dict]`, *optional*):
                Optional list of dictionaries containing additional information about vision inputs.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
            interpolation (`InterpolationMode`):
                Resampling filter to use if resizing the image.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            patch_size (`int`, *optional*, defaults to `self.patch_size`):
                The spatial patch size of the vision encoder.
            temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
                The temporal patch size of the vision encoder.
            merge_size (`int`, *optional*, defaults to `self.merge_size`):
                The merge size of the vision encoder to llm encoder.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            device (`torch.device`, *optional*):
                The device to process the images on. If unset, the device is inferred from the input images.
        """
        images = self._prepare_image_like_inputs(
            images=images,
            do_convert_rgb=do_convert_rgb,
            input_data_format=input_data_format,
            device=device,
        )
        height, width = get_image_size(images[0], channel_dim=ChannelDimension.FIRST)
        resized_height, resized_width = height, width
        # Group images by size for batched resizing
        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        resized_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            height, width = stacked_images.shape[-2:]
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
@@ -215,24 +232,25 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
        # Needed in case do_resize is False, or resize returns images with different sizes
        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
        processed_images_grouped = {}
        processed_grids = {}
        for shape, stacked_images in grouped_images.items():
            resized_height, resized_width = stacked_images.shape[-2:]
            # Fused rescale and normalize
-            stacked_images = self.rescale_and_normalize(
+            patches = self.rescale_and_normalize(
                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
            )
-            processed_images_grouped[shape] = stacked_images
+            if patches.ndim == 4:
-
+                # add a temporal dimension if we have images
-        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+                patches = patches.unsqueeze(1)
-        patches = torch.stack(processed_images, dim=0)
+            if patches.shape[1] % temporal_patch_size != 0:
-        if patches.shape[0] % temporal_patch_size != 0:
+                repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
-            repeats = patches[-1].unsqueeze(0).repeat(temporal_patch_size - 1, 1, 1, 1)
+                patches = torch.cat([patches, repeats], dim=1)
-            patches = torch.cat([patches, repeats], dim=0)
+            batch_size, grid_t, channel = patches.shape[:3]
-
+            grid_t = grid_t // temporal_patch_size
        channel = patches.shape[1]
        grid_t = patches.shape[0] // temporal_patch_size
            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
            patches = patches.view(
                batch_size,
                grid_t,
                temporal_patch_size,
                channel,
@@ -243,175 +261,34 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
                merge_size,
                patch_size,
            )
-        patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
+            # Reorder dimensions to group grid and patch information for subsequent flattening.
            # (batch, grid_t, grid_h, grid_w, merge_h, merge_w, channel, temp_patch_size, patch_h, patch_w)
            patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
            flatten_patches = patches.reshape(
-            grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size
+                batch_size,
                grid_t * grid_h * grid_w,
                channel * temporal_patch_size * patch_size * patch_size,
            )
-        return flatten_patches, (grid_t, grid_h, grid_w)
+            processed_images_grouped[shape] = flatten_patches
            processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size
-    @auto_docstring
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-    def preprocess(
+        processed_grids = reorder_images(processed_grids, grouped_images_index)
-        self,
+        pixel_values = torch.cat(processed_images, dim=0)
-        images: ImageInput,
+        image_grid_thw = torch.tensor(processed_grids)
        videos: VideoInput = None,
        do_resize: Optional[bool] = None,
        size: Optional[dict[str, int]] = None,
        resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        patch_size: Optional[int] = None,
        temporal_patch_size: Optional[int] = None,
        merge_size: Optional[int] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        device: Optional["torch.device"] = None,
        disable_grouping: Optional[bool] = False,
        **kwargs,
    ):
        r"""
        min_pixels (`int`, *optional*, defaults to `56 * 56`):
            The min pixels of the image to resize the image.
        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
            The max pixels of the image to resize the image.
        patch_size (`int`, *optional*, defaults to 14):
            The spatial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            The temporal patch size of the vision encoder.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
        """
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
-        if size is not None:
+        return BatchFeature(
-            if "shortest_edge" not in size or "longest_edge" not in size:
+            data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors
                raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
            min_pixels = size["shortest_edge"]
        elif min_pixels is not None and max_pixels is not None:
            # backward compatibility: override size with min_pixels and max_pixels if they are provided
            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
        else:
            size = {**self.size}
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        patch_size = patch_size if patch_size is not None else self.patch_size
        temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size
        merge_size = merge_size if merge_size is not None else self.merge_size
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        # Make hashable for cache
        size = SizeDict(**size) if size is not None else None
        image_mean = tuple(image_mean) if image_mean is not None else None
        image_std = tuple(image_std) if image_std is not None else None
        self._validate_preprocess_kwargs(
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
            return_tensors=return_tensors,
            data_format=data_format,
        )
        interpolation = (
            pil_torch_interpolation_mapping[resample] if isinstance(resample, (PILImageResampling, int)) else resample
        )
        if images is not None:
            images = make_flat_list_of_images(images)
        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )
        data = {}
        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for image in images:
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    size=size,
                    interpolation=interpolation,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    patch_size=patch_size,
                    temporal_patch_size=temporal_patch_size,
                    merge_size=merge_size,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    device=device,
                    disable_grouping=disable_grouping,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = torch.stack(pixel_values)
            vision_grid_thws = torch.tensor(vision_grid_thws)
            data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws})
        # kept for BC only and should be removed after v5.0
        if videos is not None:
            logger.warning(
                "`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. "
                "This is a deprecated behavior and will be removed in v5.0. "
                "Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
            )
            videos = make_batched_videos(videos)
            pixel_values_videos, vision_grid_thws_videos = [], []
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    size=size,
                    interpolation=interpolation,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    patch_size=patch_size,
                    temporal_patch_size=temporal_patch_size,
                    merge_size=merge_size,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    device=device,
                    disable_grouping=disable_grouping,
                )
                pixel_values_videos.extend(patches)
                vision_grid_thws_videos.append(video_grid_thw)
            pixel_values_videos = torch.stack(pixel_values_videos)
            vision_grid_thws_videos = torch.tensor(vision_grid_thws_videos)
            data.update({"pixel_values_videos": pixel_values_videos, "video_grid_thw": vision_grid_thws_videos})
        return BatchFeature(data=data, tensor_type=return_tensors)
    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
        """
        A utility that returns number of image patches for a given image size.
        Note: Do not remove this method! It is used by vLLM to infer the number of patches and placeholders
        without an image input.
        Args:
            height (`int`):
                Height of the input image.
--- a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py
@@ -116,8 +116,21 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
    model_input_names = ["pixel_values_videos", "video_grid_thw"]
    def __init__(self, **kwargs: Unpack[Qwen2VLVideoProcessorInitKwargs]):
-        super().__init__(**kwargs)
+        size = kwargs.pop("size", None)
-        self.size = {"shortest_edge": self.min_pixels, "longest_edge": self.max_pixels}
+        min_pixels = kwargs.pop("min_pixels", None)
        max_pixels = kwargs.pop("max_pixels", None)
        # backward compatibility: override size with min_pixels and max_pixels if they are provided
        size = self.size if size is None else size
        if min_pixels is not None:
            size["shortest_edge"] = min_pixels
            size.pop("min_pixels", None)
        if max_pixels is not None:
            size["longest_edge"] = max_pixels
            size.pop("max_pixels", None)
        if "shortest_edge" not in size or "longest_edge" not in size:
            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
        super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs)
    def sample_frames(
        self,
--- a/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py
+++ b/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py
@@ -25,10 +25,17 @@ from huggingface_hub import hf_hub_download
 from transformers import (
    AutoProcessor,
    Qwen2_5OmniProcessor,
-    Qwen2Tokenizer,
+    Qwen2TokenizerFast,
    WhisperFeatureExtractor,
 )
-from transformers.testing_utils import require_av, require_librosa, require_torch, require_torchaudio, require_vision
+from transformers.testing_utils import (
    require_av,
    require_librosa,
    require_torch,
    require_torchaudio,
    require_torchvision,
    require_vision,
 )
 from transformers.utils import is_torch_available, is_vision_available
 from ...test_processing_common import ProcessorTesterMixin
@@ -38,12 +45,13 @@ if is_torch_available():
    import torch
 if is_vision_available():
-    from transformers import Qwen2VLImageProcessor
+    from transformers import Qwen2VLImageProcessorFast
@require_vision
@require_torch
@require_torchaudio
@require_torchvision
 class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = Qwen2_5OmniProcessor
@@ -244,13 +252,13 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        )
        processor.save_pretrained(self.tmpdirname)
-        processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=False)
+        processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=True)
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
-        self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
+        self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
-        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
+        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)
        self.assertIsInstance(processor.feature_extractor, WhisperFeatureExtractor)
    def test_image_processor(self):
@@ -267,8 +275,8 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_input = self.prepare_image_inputs()
-        input_image_proc = image_processor(image_input, return_tensors="np")
+        input_image_proc = image_processor(image_input, return_tensors="pt")
-        input_processor = processor(images=image_input, text="dummy", return_tensors="np")
+        input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
        for key in input_image_proc.keys():
            self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
--- a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
@@ -20,15 +20,15 @@ import unittest
 import numpy as np
 import pytest
-from transformers import AutoProcessor, Qwen2Tokenizer
+from transformers import AutoProcessor, Qwen2TokenizerFast
-from transformers.testing_utils import require_av, require_torch, require_vision
+from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision
 from transformers.utils import is_torch_available, is_vision_available
 from ...test_processing_common import ProcessorTesterMixin
 if is_vision_available():
-    from transformers import Qwen2_5_VLProcessor, Qwen2VLImageProcessor
+    from transformers import Qwen2_5_VLProcessor, Qwen2VLImageProcessorFast
 if is_torch_available():
    import torch
@@ -36,6 +36,7 @@ if is_torch_available():
@require_vision
@require_torch
@require_torchvision
 class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = Qwen2_5_VLProcessor
@@ -73,12 +74,12 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
        )
        processor.save_pretrained(self.tmpdirname)
-        processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)
+        processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=True)
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
+        self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
-        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
+        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)
    def test_image_processor(self):
        image_processor = self.get_image_processor()
@@ -91,8 +92,8 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_input = self.prepare_image_inputs()
-        input_image_proc = image_processor(image_input, return_tensors="np")
+        input_image_proc = image_processor(image_input, return_tensors="pt")
-        input_processor = processor(images=image_input, text="dummy", return_tensors="np")
+        input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
        for key in input_image_proc.keys():
            self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
--- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -22,7 +22,7 @@ import requests
 from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_vision_available
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
 from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
@@ -35,8 +35,8 @@ if is_vision_available():
    from transformers import Qwen2VLImageProcessor
-    # if is_torchvision_available():
+    if is_torchvision_available():
-    #     from transformers import Qwen2VLImageProcessorFast
+        from transformers import Qwen2VLImageProcessorFast
 class Qwen2VLImageProcessingTester:
@@ -119,7 +119,7 @@ class Qwen2VLImageProcessingTester:
@require_vision
 class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = Qwen2VLImageProcessor if is_vision_available() else None
-    # fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None
+    fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None
    def setUp(self):
        super().setUp()
@@ -363,3 +363,34 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
        self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)
        self.assertEqual(encoding_slow.image_grid_thw.dtype, encoding_fast.image_grid_thw.dtype)
        self._assert_slow_fast_tensors_equivalence(
            encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float()
        )
    @require_vision
    @require_torch
    def test_slow_fast_equivalence_batched(self):
        """Compare slow and fast image processor outputs on a batch of images.

        The test is skipped when either processor variant is disabled or
        undefined, or when the tester enables `do_center_crop`. Otherwise both
        processors are built from `self.image_processor_dict`, run on inputs
        requested with `equal_resolution=False` and `torchify=True`, and their
        `pixel_values` and `image_grid_thw` outputs are checked against each other.
        """
        # Both variants must be enabled for the comparison to be meaningful.
        if not (self.test_slow_image_processor and self.test_fast_image_processor):
            self.skipTest(reason="Skipping slow/fast equivalence test")

        slow_cls, fast_cls = self.image_processing_class, self.fast_image_processing_class
        if slow_cls is None or fast_cls is None:
            self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

        tester = self.image_processor_tester
        # A missing `do_center_crop` attribute is treated the same as a falsy one.
        if getattr(tester, "do_center_crop", False):
            self.skipTest(
                reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
            )

        batch = tester.prepare_image_inputs(equal_resolution=False, torchify=True)

        # `image_processor_dict` is read once per processor, slow first, then fast.
        slow_processor = slow_cls(**self.image_processor_dict)
        fast_processor = fast_cls(**self.image_processor_dict)

        slow_out = slow_processor(batch, return_tensors="pt")
        fast_out = fast_processor(batch, return_tensors="pt")

        self._assert_slow_fast_tensors_equivalence(slow_out.pixel_values, fast_out.pixel_values)

        slow_grid = slow_out.image_grid_thw
        fast_grid = fast_out.image_grid_thw
        self.assertEqual(slow_grid.dtype, fast_grid.dtype)
        # The grids are cast to float before the comparison; presumably the
        # equivalence helper expects floating-point tensors (TODO confirm).
        self._assert_slow_fast_tensors_equivalence(slow_grid.float(), fast_grid.float())
--- a/tests/models/qwen2_vl/test_processor_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.py
@@ -20,18 +20,18 @@ import unittest
 import numpy as np
 import pytest
-from transformers import AutoProcessor, Qwen2Tokenizer
+from transformers import AutoProcessor, Qwen2TokenizerFast
-from transformers.testing_utils import require_av, require_torch, require_vision
+from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision
 from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
 from ...test_processing_common import ProcessorTesterMixin
 if is_vision_available():
-    from transformers import Qwen2VLImageProcessor, Qwen2VLProcessor
+    from transformers import Qwen2VLProcessor
    if is_torchvision_available():
-        from transformers import Qwen2VLVideoProcessor
+        from transformers import Qwen2VLImageProcessorFast, Qwen2VLVideoProcessor
 if is_torch_available():
    import torch
@@ -39,6 +39,7 @@ if is_torch_available():
@require_vision
@require_torch
@require_torchvision
 class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = Qwen2VLProcessor
@@ -76,12 +77,12 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
        )
        processor.save_pretrained(self.tmpdirname)
-        processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)
+        processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=True)
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
-        self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
+        self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast)
-        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
+        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast)
        self.assertIsInstance(processor.video_processor, Qwen2VLVideoProcessor)
    def test_image_processor(self):
@@ -95,8 +96,8 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_input = self.prepare_image_inputs()
-        input_image_proc = image_processor(image_input, return_tensors="np")
+        input_image_proc = image_processor(image_input, return_tensors="pt")
-        input_processor = processor(images=image_input, text="dummy", return_tensors="np")
+        input_processor = processor(images=image_input, text="dummy", return_tensors="pt")
        for key in input_image_proc.keys():
            self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -937,7 +937,7 @@ class ProcessorTesterMixin:
            "video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"]
        )
-    @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
+    @parameterized.expand([(1, "pt"), (2, "pt")])  # fast image processors supports only torchvision
    def test_apply_chat_template_image(self, batch_size: int, return_tensors: str):
        self._test_apply_chat_template(
            "image", batch_size, return_tensors, "images_input_name", "image_processor", MODALITY_INPUT_DATA["images"]