From 33d1d715b0260efc1c2df1c16d864186b5bb9437 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 14 Feb 2025 17:34:55 +0800 Subject: [PATCH] Add ImageProcessorFast to Qwen2.5-VL processor (#36164) * add qwen2 fast image processor to modular file Signed-off-by: isotr0py <2037008807@qq.com> * fix modular Signed-off-by: isotr0py <2037008807@qq.com> * fix circle import Signed-off-by: isotr0py <2037008807@qq.com> * add docs Signed-off-by: isotr0py <2037008807@qq.com> * fix typo Signed-off-by: isotr0py <2037008807@qq.com> * add modular generated files Signed-off-by: isotr0py <2037008807@qq.com> * revert qwen2vl fast image processor Signed-off-by: isotr0py <2037008807@qq.com> * remove qwen2.5-vl image processor from modular Signed-off-by: isotr0py <2037008807@qq.com> * re-generate qwen2.5-vl files Signed-off-by: isotr0py <2037008807@qq.com> * remove unnecessary test Signed-off-by: isotr0py <2037008807@qq.com> * fix auto map Signed-off-by: isotr0py <2037008807@qq.com> * cleanup Signed-off-by: isotr0py <2037008807@qq.com> * fix model_input_names Signed-off-by: isotr0py <2037008807@qq.com> * remove import Signed-off-by: isotr0py <2037008807@qq.com> * make fix-copies Signed-off-by: isotr0py <2037008807@qq.com> --------- Signed-off-by: isotr0py <2037008807@qq.com> --- docs/source/en/model_doc/qwen2_5_vl.md | 5 - src/transformers/__init__.py | 2 - .../models/auto/image_processing_auto.py | 1 + .../models/qwen2_5_vl/__init__.py | 1 - .../qwen2_5_vl/image_processing_qwen2_5_vl.py | 426 ------------------ .../models/qwen2_5_vl/modular_qwen2_5_vl.py | 59 +-- .../qwen2_5_vl/processing_qwen2_5_vl.py | 11 +- .../utils/dummy_vision_objects.py | 7 - .../test_image_processing_qwen2_5_vl.py | 252 ----------- .../qwen2_5_vl/test_processor_qwen2_5_vl.py | 4 +- 10 files changed, 20 insertions(+), 748 deletions(-) delete mode 100644 src/transformers/models/qwen2_5_vl/image_processing_qwen2_5_vl.py delete mode 100644 tests/models/qwen2_5_vl/test_image_processing_qwen2_5_vl.py diff --git a/docs/source/en/model_doc/qwen2_5_vl.md b/docs/source/en/model_doc/qwen2_5_vl.md index df3b8fb896..f08343506b 100644 --- a/docs/source/en/model_doc/qwen2_5_vl.md +++ b/docs/source/en/model_doc/qwen2_5_vl.md @@ -264,11 +264,6 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained( [[autodoc]] Qwen2_5_VLConfig -## Qwen2_5_VLImageProcessor - -[[autodoc]] Qwen2_5_VLImageProcessor - - preprocess - ## Qwen2_5_VLProcessor [[autodoc]] Qwen2_5_VLProcessor diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 8b97168ecf..e9c752b854 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1281,7 +1281,6 @@ else: _import_structure["models.pixtral"].append("PixtralImageProcessor") _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"]) _import_structure["models.pvt"].extend(["PvtImageProcessor"]) - _import_structure["models.qwen2_5_vl"].extend(["Qwen2_5_VLImageProcessor"]) _import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"]) _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor"]) _import_structure["models.sam"].extend(["SamImageProcessor"]) @@ -6444,7 +6443,6 @@ if TYPE_CHECKING: PoolFormerImageProcessor, ) from .models.pvt import PvtImageProcessor - from .models.qwen2_5_vl import Qwen2_5_VLImageProcessor from .models.qwen2_vl import Qwen2VLImageProcessor from .models.rt_detr import RTDetrImageProcessor from .models.sam import SamImageProcessor diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 724137bd62..ef4d9b25d1 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -127,6 +127,7 @@ else: ("poolformer", ("PoolFormerImageProcessor",)), ("pvt", ("PvtImageProcessor",)), ("pvt_v2", ("PvtImageProcessor",)), + ("qwen2_5_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("qwen2_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("regnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("resnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), diff --git a/src/transformers/models/qwen2_5_vl/__init__.py b/src/transformers/models/qwen2_5_vl/__init__.py index 5d3cd215b0..7a9f44a7a0 100644 --- a/src/transformers/models/qwen2_5_vl/__init__.py +++ b/src/transformers/models/qwen2_5_vl/__init__.py @@ -19,7 +19,6 @@ from ...utils.import_utils import define_import_structure if TYPE_CHECKING: from .configuration_qwen2_5_vl import * - from .image_processing_qwen2_5_vl import * from .modeling_qwen2_5_vl import * from .processing_qwen2_5_vl import * else: diff --git a/src/transformers/models/qwen2_5_vl/image_processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/image_processing_qwen2_5_vl.py deleted file mode 100644 index 17afed7d6d..0000000000 --- a/src/transformers/models/qwen2_5_vl/image_processing_qwen2_5_vl.py +++ /dev/null @@ -1,426 +0,0 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py. -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_qwen2_5_vl.py file directly. One of our CI enforces this. -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# coding=utf-8 -# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -from typing import Dict, List, Optional, Union - -import numpy as np - -from ...feature_extraction_utils import BatchFeature -from ...image_processing_utils import BaseImageProcessor -from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format -from ...image_utils import ( - OPENAI_CLIP_MEAN, - OPENAI_CLIP_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - VideoInput, - get_image_size, - infer_channel_dimension_format, - is_scaled_image, - make_batched_videos, - make_flat_list_of_images, - make_list_of_images, - to_numpy_array, - valid_images, - validate_preprocess_arguments, -) -from ...utils import TensorType, logging - - -logger = logging.get_logger(__name__) - - -def smart_resize( - height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 -): - """Rescales the image so that the following conditions are met: - - 1. Both dimensions (height and width) are divisible by 'factor'. - - 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. - - 3. The aspect ratio of the image is maintained as closely as possible. - - """ - if height < factor or width < factor: - raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}") - elif max(height, width) / min(height, width) > 200: - raise ValueError( - f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}" - ) - h_bar = round(height / factor) * factor - w_bar = round(width / factor) * factor - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = math.floor(height / beta / factor) * factor - w_bar = math.floor(width / beta / factor) * factor - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = math.ceil(height * beta / factor) * factor - w_bar = math.ceil(width * beta / factor) * factor - return h_bar, w_bar - - -class Qwen2_5_VLImageProcessor(BaseImageProcessor): - r""" - Constructs a Qwen2.5-VL image processor that dynamically resizes images based on the original images. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use when resizing the image. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): - Mean to use if normalizing the image. This is a float or list of floats for each channel in the image. - image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): - Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image. - do_convert_rgb (`bool`, *optional*, defaults to `True`): - Whether to convert the image to RGB. - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spacial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - model_input_names = [ - "pixel_values", - "image_grid_thw", - "pixel_values_videos", - "video_grid_thw", - "second_per_grid_ts", - ] - - def __init__( - self, - do_resize: bool = True, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = True, - min_pixels: int = 56 * 56, - max_pixels: int = 28 * 28 * 1280, - patch_size: int = 14, - temporal_patch_size: int = 2, - merge_size: int = 2, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self.do_resize = do_resize - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN - self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD - self.min_pixels = min_pixels - self.max_pixels = max_pixels - self.patch_size = patch_size - self.temporal_patch_size = temporal_patch_size - self.merge_size = merge_size - self.size = {"shortest_edge": min_pixels, "longest_edge": max_pixels} - self.do_convert_rgb = do_convert_rgb - - def _preprocess( - self, - images: Union[ImageInput, VideoInput], - do_resize: bool = None, - resample: PILImageResampling = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ): - """ - Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. - - Args: - images (`ImageInput`): - Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`. - vision_info (`List[Dict]`, *optional*): - Optional list of dictionaries containing additional information about vision inputs. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - resample (`PILImageResampling`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - """ - images = make_list_of_images(images) - - if do_convert_rgb: - images = [convert_to_rgb(image) for image in images] - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if do_rescale and is_scaled_image(images[0]): - logger.warning_once( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) - - height, width = get_image_size(images[0], channel_dim=input_data_format) - resized_height, resized_width = height, width - processed_images = [] - for image in images: - if do_resize: - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, - min_pixels=self.min_pixels, - max_pixels=self.max_pixels, - ) - image = resize( - image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format - ) - - if do_rescale: - image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format) - - if do_normalize: - image = self.normalize( - image=image, mean=image_mean, std=image_std, input_data_format=input_data_format - ) - - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - processed_images.append(image) - - patches = np.array(processed_images) - if data_format == ChannelDimension.LAST: - patches = patches.transpose(0, 3, 1, 2) - if patches.shape[0] % self.temporal_patch_size != 0: - repeats = np.repeat(patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0) - patches = np.concatenate([patches, repeats], axis=0) - channel = patches.shape[1] - grid_t = patches.shape[0] // self.temporal_patch_size - grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size - patches = patches.reshape( - grid_t, - self.temporal_patch_size, - channel, - grid_h // self.merge_size, - self.merge_size, - self.patch_size, - grid_w // self.merge_size, - self.merge_size, - self.patch_size, - ) - patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8) - flatten_patches = patches.reshape( - grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size - ) - - return flatten_patches, (grid_t, grid_h, grid_w) - - def preprocess( - self, - images: ImageInput, - videos: VideoInput = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ): - """ - Args: - images (`ImageInput`): - Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If - passing in images with pixel values between 0 and 1, set `do_rescale=False`. - videos (`VideoInput`): - Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If - passing in videos with pixel values between 0 and 1, set `do_rescale=False`. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with - the longest edge resized to keep the input aspect ratio. - resample (`int`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only - has an effect if `do_resize` is set to `True`. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to - `True`. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - - """ - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - resample = resample if resample is not None else self.resample - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - - if images is not None: - images = make_flat_list_of_images(images) - if videos is not None: - videos = make_batched_videos(videos) - - if images is not None and not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - validate_preprocess_arguments( - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - do_resize=do_resize, - size=size, - resample=resample, - ) - - if images is not None: - pixel_values, vision_grid_thws = [], [] - for image in images: - patches, image_grid_thw = self._preprocess( - image, - do_resize=do_resize, - resample=resample, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - data_format=data_format, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - ) - pixel_values.extend(patches) - vision_grid_thws.append(image_grid_thw) - pixel_values = np.array(pixel_values) - vision_grid_thws = np.array(vision_grid_thws) - data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws} - - if videos is not None: - pixel_values, vision_grid_thws = [], [] - for images in videos: - patches, video_grid_thw = self._preprocess( - images, - do_resize=do_resize, - resample=resample, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - data_format=data_format, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - ) - pixel_values.extend(patches) - vision_grid_thws.append(video_grid_thw) - pixel_values = np.array(pixel_values) - vision_grid_thws = np.array(vision_grid_thws) - data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws} - - return BatchFeature(data=data, tensor_type=return_tensors) - - -__all__ = ["Qwen2_5_VLImageProcessor"] diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index da54b23142..0740de2e21 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -29,7 +29,6 @@ import torch.utils.checkpoint from torch.nn import CrossEntropyLoss from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig -from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor from transformers.models.qwen2_vl.modeling_qwen2_vl import ( PatchEmbed, PatchMerger, @@ -854,48 +853,6 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration): return model_inputs -class Qwen2_5_VLImageProcessor(Qwen2VLImageProcessor): - r""" - Constructs a Qwen2.5-VL image processor that dynamically resizes images based on the original images. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use when resizing the image. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): - Mean to use if normalizing the image. This is a float or list of floats for each channel in the image. - image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): - Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image. - do_convert_rgb (`bool`, *optional*, defaults to `True`): - Whether to convert the image to RGB. - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spacial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - model_input_names = [ - "pixel_values", - "image_grid_thw", - "pixel_values_videos", - "video_grid_thw", - "second_per_grid_ts", - ] - - class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): fps: Union[List[float], float] @@ -913,10 +870,10 @@ class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): class Qwen2_5_VLProcessor(Qwen2VLProcessor): r""" Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor. - [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2_5_VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the + [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information. Args: - image_processor ([`Qwen2_5_VLImageProcessor`], *optional*): + image_processor ([`Qwen2VLImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`Qwen2TokenizerFast`], *optional*): The tokenizer is a required input. @@ -924,7 +881,14 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor): in a chat into a tokenizable string. """ - image_processor_class = "Qwen2_5_VLImageProcessor" + image_processor_class = "AutoImageProcessor" + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return names_from_processor + ["second_per_grid_ts"] def __call__( self, @@ -937,7 +901,7 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor): Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to - Qwen2_5_VLImageProcessor's [`~Qwen2_5_VLImageProcessor.__call__`] if `vision_infos` is not `None`. + Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`. Args: images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): @@ -1040,6 +1004,5 @@ __all__ = [ "Qwen2_5_VLForConditionalGeneration", "Qwen2_5_VLModel", "Qwen2_5_VLPreTrainedModel", - "Qwen2_5_VLImageProcessor", "Qwen2_5_VLProcessor", ] diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index 88c383d8e7..a11010f6c9 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -48,10 +48,10 @@ class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): class Qwen2_5_VLProcessor(ProcessorMixin): r""" Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor. - [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2_5_VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the + [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information. Args: - image_processor ([`Qwen2_5_VLImageProcessor`], *optional*): + image_processor ([`Qwen2VLImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`Qwen2TokenizerFast`], *optional*): The tokenizer is a required input. @@ -62,7 +62,7 @@ class Qwen2_5_VLProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] valid_kwargs = ["chat_template"] - image_processor_class = "Qwen2_5_VLImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): @@ -81,7 +81,7 @@ class Qwen2_5_VLProcessor(ProcessorMixin): Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to - Qwen2_5_VLImageProcessor's [`~Qwen2_5_VLImageProcessor.__call__`] if `vision_infos` is not `None`. + Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`. Args: images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): @@ -212,7 +212,8 @@ class Qwen2_5_VLProcessor(ProcessorMixin): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return names_from_processor + ["second_per_grid_ts"] __all__ = ["Qwen2_5_VLProcessor"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index aeccf53742..64a69ef117 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -590,13 +590,6 @@ class PvtImageProcessor(metaclass=DummyObject): requires_backends(self, ["vision"]) -class Qwen2_5_VLImageProcessor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class Qwen2VLImageProcessor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/qwen2_5_vl/test_image_processing_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_image_processing_qwen2_5_vl.py deleted file mode 100644 index 4c991ec710..0000000000 --- a/tests/models/qwen2_5_vl/test_image_processing_qwen2_5_vl.py +++ /dev/null @@ -1,252 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np - -from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD -from transformers.models.qwen2_5_vl.image_processing_qwen2_5_vl import smart_resize -from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_torch_available, is_vision_available - -from ...test_image_processing_common import ( - ImageProcessingTestMixin, - prepare_image_inputs, -) - - -if is_torch_available(): - import torch - -if is_vision_available(): - from PIL import Image - - from transformers import Qwen2_5_VLImageProcessor - - -class Qwen2_5_VLImageProcessingTester: - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - min_resolution=56, - max_resolution=1024, - min_pixels=56 * 56, - max_pixels=28 * 28 * 1280, - do_normalize=True, - image_mean=OPENAI_CLIP_MEAN, - image_std=OPENAI_CLIP_STD, - do_resize=True, - patch_size=14, - temporal_patch_size=2, - merge_size=2, - do_convert_rgb=True, - ): - self.parent = parent - self.batch_size = batch_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.num_channels = num_channels - self.image_mean = OPENAI_CLIP_MEAN - self.image_std = OPENAI_CLIP_STD - self.min_pixels = min_pixels - self.max_pixels = max_pixels - self.patch_size = patch_size - self.temporal_patch_size = temporal_patch_size - self.merge_size = merge_size - self.do_resize = do_resize - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_convert_rgb = do_convert_rgb - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "min_pixels": self.min_pixels, - "max_pixels": self.max_pixels, - "patch_size": self.patch_size, - "temporal_patch_size": self.temporal_patch_size, - "merge_size": self.merge_size, - } - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - images = prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - return [[image] for image in images] - - -@require_torch -@require_vision -class Qwen2_5_VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = Qwen2_5_VLImageProcessor if is_vision_available() else None - - def setUp(self): - super().setUp() - self.image_processor_tester = Qwen2_5_VLImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "min_pixels")) - self.assertTrue(hasattr(image_processing, "max_pixels")) - self.assertTrue(hasattr(image_processing, "do_convert_rgb")) - self.assertTrue(hasattr(image_processing, "patch_size")) - self.assertTrue(hasattr(image_processing, "temporal_patch_size")) - self.assertTrue(hasattr(image_processing, "merge_size")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.min_pixels, 56 * 56) - self.assertEqual(image_processor.max_pixels, 28 * 28 * 1280) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, min_pixels=256 * 256, max_pixels=640 * 640 - ) - self.assertEqual(image_processor.min_pixels, 256 * 256) - self.assertEqual(image_processor.max_pixels, 640 * 640) - - def test_select_best_resolution(self): - # Test with a final resize resolution - best_resolution = smart_resize(561, 278, factor=28) - self.assertEqual(best_resolution, (560, 280)) - - def test_call_pil(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) - for image in image_inputs: - self.assertIsInstance(image[0], Image.Image) - - # Test not batched input - prcocess_out = image_processing(image_inputs[0], return_tensors="pt") - encoded_images = prcocess_out.pixel_values - image_grid_thws = prcocess_out.image_grid_thw - expected_output_image_shape = (4900, 1176) - expected_image_grid_thws = torch.Tensor([[1, 70, 70]]) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - self.assertTrue((image_grid_thws == expected_image_grid_thws).all()) - - # Test batched - prcocess_out = image_processing(image_inputs, return_tensors="pt") - encoded_images = prcocess_out.pixel_values - image_grid_thws = prcocess_out.image_grid_thw - expected_output_image_shape = (34300, 1176) - expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - self.assertTrue((image_grid_thws == expected_image_grid_thws).all()) - - def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True) - for image in image_inputs: - self.assertIsInstance(image[0], np.ndarray) - - # Test not batched input - prcocess_out = image_processing(image_inputs[0], return_tensors="pt") - encoded_images = prcocess_out.pixel_values - image_grid_thws = prcocess_out.image_grid_thw - expected_output_image_shape = (4900, 1176) - expected_image_grid_thws = torch.Tensor([[1, 70, 70]]) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - self.assertTrue((image_grid_thws == expected_image_grid_thws).all()) - - # Test batched - prcocess_out = image_processing(image_inputs, return_tensors="pt") - encoded_images = prcocess_out.pixel_values - image_grid_thws = prcocess_out.image_grid_thw - expected_output_image_shape = (34300, 1176) - expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - self.assertTrue((image_grid_thws == expected_image_grid_thws).all()) - - def test_call_pytorch(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True) - - for image in image_inputs: - self.assertIsInstance(image[0], torch.Tensor) - - # Test not batched input - prcocess_out = image_processing(image_inputs[0], return_tensors="pt") - encoded_images = prcocess_out.pixel_values - image_grid_thws = prcocess_out.image_grid_thw - expected_output_image_shape = (4900, 1176) - expected_image_grid_thws = torch.Tensor([[1, 70, 70]]) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - self.assertTrue((image_grid_thws == expected_image_grid_thws).all()) - - # Test batched - prcocess_out = image_processing(image_inputs, return_tensors="pt") - encoded_images = prcocess_out.pixel_values - image_grid_thws = prcocess_out.image_grid_thw - expected_output_image_shape = (34300, 1176) - expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - self.assertTrue((image_grid_thws == expected_image_grid_thws).all()) - - @unittest.skip(reason="Qwen2_5_VLImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") - def test_call_numpy_4_channels(self): - pass - - def test_nested_input(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) - - # Test batched as a list of images - prcocess_out = image_processing(image_inputs, return_tensors="pt") - encoded_images = prcocess_out.pixel_values - image_grid_thws = prcocess_out.image_grid_thw - expected_output_image_shape = (34300, 1176) - expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - self.assertTrue((image_grid_thws == expected_image_grid_thws).all()) - - # Test batched as a nested list of images, where each sublist is one batch - image_inputs_nested = image_inputs[:3] + image_inputs[3:] - prcocess_out = image_processing(image_inputs_nested, return_tensors="pt") - encoded_images_nested = prcocess_out.pixel_values - image_grid_thws_nested = prcocess_out.image_grid_thw - expected_output_image_shape = (34300, 1176) - expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * 7) - self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape) - self.assertTrue((image_grid_thws == expected_image_grid_thws).all()) - - # Image processor should return same pixel values, independently of ipnut format - self.assertTrue((encoded_images_nested == encoded_images).all()) - self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all()) diff --git a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py index c85389c073..481e206a71 100644 --- a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py @@ -27,7 +27,7 @@ from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): - from transformers import Qwen2_5_VLImageProcessor, Qwen2_5_VLProcessor + from transformers import Qwen2_5_VLProcessor, Qwen2VLImageProcessor @require_vision @@ -63,7 +63,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string()) self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer) - self.assertIsInstance(processor.image_processor, Qwen2_5_VLImageProcessor) + self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor) def test_image_processor(self): image_processor = self.get_image_processor()