diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 3d7be9f18b..cd0473a2d7 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -49,6 +49,9 @@ from .configuration_auto import ( logger = logging.get_logger(__name__) +FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"] + + if TYPE_CHECKING: # This significantly improves completion suggestion performance when # the transformers package is used with Microsoft's Pylance language server. @@ -514,6 +517,13 @@ class AutoImageProcessor: # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor. if use_fast is None: use_fast = image_processor_type.endswith("Fast") + if not use_fast and image_processor_type in FORCE_FAST_IMAGE_PROCESSOR and is_torchvision_available(): + use_fast = True + logger.warning_once( + f"The image processor of type `{image_processor_type}` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. " + "This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. " + "Note that this behavior will be extended to all models in a future release." + ) if not use_fast: logger.warning_once( "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. " diff --git a/src/transformers/models/colqwen2/modular_colqwen2.py b/src/transformers/models/colqwen2/modular_colqwen2.py index 0fcdc09b7b..8e06d2ef32 100644 --- a/src/transformers/models/colqwen2/modular_colqwen2.py +++ b/src/transformers/models/colqwen2/modular_colqwen2.py @@ -67,7 +67,7 @@ class ColQwen2Processor(ColPaliProcessor): query_prefix (`str`, *optional*): A prefix to be used for the query. """ - image_processor_class = "Qwen2VLImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") def __init__( diff --git a/src/transformers/models/colqwen2/processing_colqwen2.py b/src/transformers/models/colqwen2/processing_colqwen2.py index e283f57396..59af4bdd42 100644 --- a/src/transformers/models/colqwen2/processing_colqwen2.py +++ b/src/transformers/models/colqwen2/processing_colqwen2.py @@ -66,7 +66,7 @@ class ColQwen2Processor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] - image_processor_class = "Qwen2VLImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") def __init__( diff --git a/src/transformers/models/glm4v/image_processing_glm4v_fast.py b/src/transformers/models/glm4v/image_processing_glm4v_fast.py index a509be55a8..099384419e 100644 --- a/src/transformers/models/glm4v/image_processing_glm4v_fast.py +++ b/src/transformers/models/glm4v/image_processing_glm4v_fast.py @@ -138,6 +138,7 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast): processed_images_grouped = {} processed_grids = {} for shape, stacked_images in grouped_images.items(): + resized_height, resized_width = stacked_images.shape[-2:] # Fused rescale and normalize stacked_images = self.rescale_and_normalize( stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std @@ -188,9 +189,6 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast): images: ImageInput, **kwargs: Unpack[Glm4vFastImageProcessorKwargs], ) -> BatchFeature: - """ - Preprocess an image or batch of images. - """ return super().preprocess(images, **kwargs) diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py index 5aa5dd8887..27628e2f74 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py @@ -35,9 +35,6 @@ from ...image_utils import ( ImageInput, PILImageResampling, SizeDict, - get_image_size, - make_flat_list_of_images, - valid_images, ) from ...processing_utils import Unpack from ...utils import ( @@ -57,8 +54,6 @@ if is_torch_available(): if is_torchvision_available(): - from ...image_utils import pil_torch_interpolation_mapping - if is_torchvision_v2_available(): from torchvision.transforms.v2 import functional as F else: @@ -110,18 +105,90 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast): size = kwargs.pop("size", None) min_pixels = kwargs.pop("min_pixels", None) max_pixels = kwargs.pop("max_pixels", None) - if size is not None and ("shortest_edge" not in size or "longest_edge" not in size): - raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") - else: - size = self.size # backward compatibility: override size with min_pixels and max_pixels if they are provided + size = self.size if size is None else size if min_pixels is not None: size["shortest_edge"] = min_pixels + size.pop("min_pixels", None) if max_pixels is not None: size["longest_edge"] = max_pixels + size.pop("max_pixels", None) + if "shortest_edge" not in size or "longest_edge" not in size: + raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs) + def _further_process_kwargs( + self, + size: Optional[SizeDict] = None, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + **kwargs, + ) -> dict: + """ + Update kwargs that need further processing before being validated + Can be overridden by subclasses to customize the processing of kwargs. + """ + if min_pixels is not None and max_pixels is not None: + size = {"shortest_edge": min_pixels, "longest_edge": max_pixels} + elif size is not None: + if "shortest_edge" not in size or "longest_edge" not in size: + raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") + min_pixels = size["shortest_edge"] + max_pixels = size["longest_edge"] + else: + size = {**self.size} + + return super()._further_process_kwargs(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs) + + @auto_docstring + def preprocess( + self, + images: ImageInput, + videos: Optional[VideoInput] = None, + **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs], + ) -> BatchFeature: + return super().preprocess(images, videos, **kwargs) + + def _preprocess_image_like_inputs( + self, + images: ImageInput, + videos: VideoInput, + do_convert_rgb: bool, + input_data_format: ChannelDimension, + device: Optional[Union[str, "torch.device"]] = None, + **kwargs: Unpack[DefaultFastImageProcessorKwargs], + ) -> BatchFeature: + """ + Preprocess image-like inputs. + To be overriden by subclasses when image-like inputs other than images should be processed. + It can be used for segmentation maps, depth maps, etc. + """ + # Prepare input images + batch_feature = BatchFeature() + if images is not None: + images = self._prepare_image_like_inputs( + images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device + ) + batch_feature = self._preprocess(images, **kwargs) + if videos is not None: + logger.warning( + "`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. " + "This is a deprecated behavior and will be removed in v5.0. " + "Your videos should be forwarded to `Qwen2VLVideoProcessor`. " + ) + # Can't change _prepare_images_structure to work with videos because it also needs to work with images. + videos = make_batched_videos(videos) + videos = [ + torch.stack(self._prepare_image_like_inputs(video, do_convert_rgb, input_data_format, device)) + for video in videos + ] + video_outputs = self._preprocess(videos, **kwargs) + batch_feature.update( + {"pixel_values_videos": video_outputs.pixel_values, "video_grid_thw": video_outputs.image_grid_thw} + ) + return batch_feature + def _preprocess( self, images: list["torch.Tensor"], @@ -136,65 +203,15 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast): patch_size: int, temporal_patch_size: int, merge_size: int, - do_convert_rgb: bool, - input_data_format: Optional[Union[str, ChannelDimension]], - device: Optional[Union[str, torch.device]], disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, ): - """ - Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. - - Args: - images (`ImageInput`): - Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`. - vision_info (`list[Dict]`, *optional*): - Optional list of dictionaries containing additional information about vision inputs. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present. - interpolation (`InterpolationMode`): - Resampling filter to use if resizing the image. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): - Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image. - image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): - Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image. - patch_size (`int`, *optional*, defaults to `self.patch_size`): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to `self.merge_size`): - The merge size of the vision encoder to llm encoder. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - device (`torch.device`, *optional*): - The device to process the images on. If unset, the device is inferred from the input images. - """ - images = self._prepare_image_like_inputs( - images=images, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - device=device, - ) - - height, width = get_image_size(images[0], channel_dim=ChannelDimension.FIRST) - resized_height, resized_width = height, width - # Group images by size for batched resizing grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): + height, width = stacked_images.shape[-2:] if do_resize: resized_height, resized_width = smart_resize( height, @@ -215,203 +232,63 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast): # Needed in case do_resize is False, or resize returns images with different sizes grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) processed_images_grouped = {} + processed_grids = {} for shape, stacked_images in grouped_images.items(): + resized_height, resized_width = stacked_images.shape[-2:] # Fused rescale and normalize - stacked_images = self.rescale_and_normalize( + patches = self.rescale_and_normalize( stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std ) - processed_images_grouped[shape] = stacked_images + if patches.ndim == 4: + # add a temporal dimension if we have images + patches = patches.unsqueeze(1) + if patches.shape[1] % temporal_patch_size != 0: + repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1) + patches = torch.cat([patches, repeats], dim=1) + batch_size, grid_t, channel = patches.shape[:3] + grid_t = grid_t // temporal_patch_size + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + + patches = patches.view( + batch_size, + grid_t, + temporal_patch_size, + channel, + grid_h // merge_size, + merge_size, + patch_size, + grid_w // merge_size, + merge_size, + patch_size, + ) + # Reorder dimensions to group grid and patch information for subsequent flattening. + # (batch, grid_t, grid_h, grid_w, merge_h, merge_w, channel, temp_patch_size, patch_h, patch_w) + patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9) + flatten_patches = patches.reshape( + batch_size, + grid_t * grid_h * grid_w, + channel * temporal_patch_size * patch_size * patch_size, + ) + + processed_images_grouped[shape] = flatten_patches + processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size processed_images = reorder_images(processed_images_grouped, grouped_images_index) - patches = torch.stack(processed_images, dim=0) - if patches.shape[0] % temporal_patch_size != 0: - repeats = patches[-1].unsqueeze(0).repeat(temporal_patch_size - 1, 1, 1, 1) - patches = torch.cat([patches, repeats], dim=0) + processed_grids = reorder_images(processed_grids, grouped_images_index) + pixel_values = torch.cat(processed_images, dim=0) + image_grid_thw = torch.tensor(processed_grids) - channel = patches.shape[1] - grid_t = patches.shape[0] // temporal_patch_size - grid_h, grid_w = resized_height // patch_size, resized_width // patch_size - - patches = patches.view( - grid_t, - temporal_patch_size, - channel, - grid_h // merge_size, - merge_size, - patch_size, - grid_w // merge_size, - merge_size, - patch_size, + return BatchFeature( + data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors ) - patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8) - flatten_patches = patches.reshape( - grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size - ) - - return flatten_patches, (grid_t, grid_h, grid_w) - - @auto_docstring - def preprocess( - self, - images: ImageInput, - videos: VideoInput = None, - do_resize: Optional[bool] = None, - size: Optional[dict[str, int]] = None, - resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] = None, - do_rescale: Optional[bool] = None, - rescale_factor: Optional[float] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, list[float]]] = None, - image_std: Optional[Union[float, list[float]]] = None, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - patch_size: Optional[int] = None, - temporal_patch_size: Optional[int] = None, - merge_size: Optional[int] = None, - do_convert_rgb: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - device: Optional["torch.device"] = None, - disable_grouping: Optional[bool] = False, - **kwargs, - ): - r""" - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - min_pixels = min_pixels if min_pixels is not None else self.min_pixels - max_pixels = max_pixels if max_pixels is not None else self.max_pixels - - if size is not None: - if "shortest_edge" not in size or "longest_edge" not in size: - raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") - min_pixels = size["shortest_edge"] - elif min_pixels is not None and max_pixels is not None: - # backward compatibility: override size with min_pixels and max_pixels if they are provided - size = {"shortest_edge": min_pixels, "longest_edge": max_pixels} - else: - size = {**self.size} - - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - resample = resample if resample is not None else self.resample - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - patch_size = patch_size if patch_size is not None else self.patch_size - temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size - merge_size = merge_size if merge_size is not None else self.merge_size - do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - - # Make hashable for cache - size = SizeDict(**size) if size is not None else None - image_mean = tuple(image_mean) if image_mean is not None else None - image_std = tuple(image_std) if image_std is not None else None - - self._validate_preprocess_kwargs( - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - do_resize=do_resize, - size=size, - resample=resample, - return_tensors=return_tensors, - data_format=data_format, - ) - interpolation = ( - pil_torch_interpolation_mapping[resample] if isinstance(resample, (PILImageResampling, int)) else resample - ) - - if images is not None: - images = make_flat_list_of_images(images) - - if images is not None and not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - data = {} - if images is not None: - pixel_values, vision_grid_thws = [], [] - for image in images: - patches, image_grid_thw = self._preprocess( - image, - do_resize=do_resize, - size=size, - interpolation=interpolation, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - patch_size=patch_size, - temporal_patch_size=temporal_patch_size, - merge_size=merge_size, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - device=device, - disable_grouping=disable_grouping, - ) - pixel_values.extend(patches) - vision_grid_thws.append(image_grid_thw) - pixel_values = torch.stack(pixel_values) - vision_grid_thws = torch.tensor(vision_grid_thws) - data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}) - - # kept for BC only and should be removed after v5.0 - if videos is not None: - logger.warning( - "`Qwen2VLImageProcessorFast` works only with image inputs and doesn't process videos anymore. " - "This is a deprecated behavior and will be removed in v5.0. " - "Your videos should be forwarded to `Qwen2VLVideoProcessor`. " - ) - videos = make_batched_videos(videos) - pixel_values_videos, vision_grid_thws_videos = [], [] - for images in videos: - patches, video_grid_thw = self._preprocess( - images, - do_resize=do_resize, - size=size, - interpolation=interpolation, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - patch_size=patch_size, - temporal_patch_size=temporal_patch_size, - merge_size=merge_size, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - device=device, - disable_grouping=disable_grouping, - ) - pixel_values_videos.extend(patches) - vision_grid_thws_videos.append(video_grid_thw) - pixel_values_videos = torch.stack(pixel_values_videos) - vision_grid_thws_videos = torch.tensor(vision_grid_thws_videos) - data.update({"pixel_values_videos": pixel_values_videos, "video_grid_thw": vision_grid_thws_videos}) - - return BatchFeature(data=data, tensor_type=return_tensors) def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None): """ A utility that returns number of image patches for a given image size. + Note: Do not remove this method! It is used by vLLM to infer the number of patches and placeholders + without an image input. + Args: height (`int`): Height of the input image. diff --git a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py index 2f34b9df7b..6c62a568f6 100644 --- a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py @@ -116,8 +116,21 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor): model_input_names = ["pixel_values_videos", "video_grid_thw"] def __init__(self, **kwargs: Unpack[Qwen2VLVideoProcessorInitKwargs]): - super().__init__(**kwargs) - self.size = {"shortest_edge": self.min_pixels, "longest_edge": self.max_pixels} + size = kwargs.pop("size", None) + min_pixels = kwargs.pop("min_pixels", None) + max_pixels = kwargs.pop("max_pixels", None) + # backward compatibility: override size with min_pixels and max_pixels if they are provided + size = self.size if size is None else size + if min_pixels is not None: + size["shortest_edge"] = min_pixels + size.pop("min_pixels", None) + if max_pixels is not None: + size["longest_edge"] = max_pixels + size.pop("max_pixels", None) + if "shortest_edge" not in size or "longest_edge" not in size: + raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") + + super().__init__(size=size, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs) def sample_frames( self, diff --git a/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py index 493d08c0d4..0baea494cd 100644 --- a/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py +++ b/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py @@ -25,10 +25,17 @@ from huggingface_hub import hf_hub_download from transformers import ( AutoProcessor, Qwen2_5OmniProcessor, - Qwen2Tokenizer, + Qwen2TokenizerFast, WhisperFeatureExtractor, ) -from transformers.testing_utils import require_av, require_librosa, require_torch, require_torchaudio, require_vision +from transformers.testing_utils import ( + require_av, + require_librosa, + require_torch, + require_torchaudio, + require_torchvision, + require_vision, +) from transformers.utils import is_torch_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -38,12 +45,13 @@ if is_torch_available(): import torch if is_vision_available(): - from transformers import Qwen2VLImageProcessor + from transformers import Qwen2VLImageProcessorFast @require_vision @require_torch @require_torchaudio +@require_torchvision class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Qwen2_5OmniProcessor @@ -244,13 +252,13 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase): ) processor.save_pretrained(self.tmpdirname) - processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=False) + processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=True) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string()) self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) - self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer) - self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor) + self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast) + self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast) self.assertIsInstance(processor.feature_extractor, WhisperFeatureExtractor) def test_image_processor(self): @@ -267,8 +275,8 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase): image_input = self.prepare_image_inputs() - input_image_proc = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, text="dummy", return_tensors="np") + input_image_proc = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, text="dummy", return_tensors="pt") for key in input_image_proc.keys(): self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py index 930b96e555..b8aa49b004 100644 --- a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py @@ -20,15 +20,15 @@ import unittest import numpy as np import pytest -from transformers import AutoProcessor, Qwen2Tokenizer -from transformers.testing_utils import require_av, require_torch, require_vision +from transformers import AutoProcessor, Qwen2TokenizerFast +from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision from transformers.utils import is_torch_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): - from transformers import Qwen2_5_VLProcessor, Qwen2VLImageProcessor + from transformers import Qwen2_5_VLProcessor, Qwen2VLImageProcessorFast if is_torch_available(): import torch @@ -36,6 +36,7 @@ if is_torch_available(): @require_vision @require_torch +@require_torchvision class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Qwen2_5_VLProcessor @@ -73,12 +74,12 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor ) processor.save_pretrained(self.tmpdirname) - processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=False) + processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=True) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer) - self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor) + self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast) + self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -91,8 +92,8 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): image_input = self.prepare_image_inputs() - input_image_proc = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, text="dummy", return_tensors="np") + input_image_proc = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, text="dummy", return_tensors="pt") for key in input_image_proc.keys(): self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py index f671f8f130..6ff2fa70c0 100644 --- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py @@ -22,7 +22,7 @@ import requests from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_torch_available, is_vision_available +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs @@ -35,8 +35,8 @@ if is_vision_available(): from transformers import Qwen2VLImageProcessor - # if is_torchvision_available(): - # from transformers import Qwen2VLImageProcessorFast + if is_torchvision_available(): + from transformers import Qwen2VLImageProcessorFast class Qwen2VLImageProcessingTester: @@ -119,7 +119,7 @@ class Qwen2VLImageProcessingTester: @require_vision class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processing_class = Qwen2VLImageProcessor if is_vision_available() else None - # fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None + fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None def setUp(self): super().setUp() @@ -363,3 +363,34 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): encoding_fast = image_processor_fast(dummy_image, return_tensors="pt") self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values) + self.assertEqual(encoding_slow.image_grid_thw.dtype, encoding_fast.image_grid_thw.dtype) + self._assert_slow_fast_tensors_equivalence( + encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float() + ) + + @require_vision + @require_torch + def test_slow_fast_equivalence_batched(self): + if not self.test_slow_image_processor or not self.test_fast_image_processor: + self.skipTest(reason="Skipping slow/fast equivalence test") + + if self.image_processing_class is None or self.fast_image_processing_class is None: + self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined") + + if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop: + self.skipTest( + reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors" + ) + + dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + image_processor_slow = self.image_processing_class(**self.image_processor_dict) + image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict) + + encoding_slow = image_processor_slow(dummy_images, return_tensors="pt") + encoding_fast = image_processor_fast(dummy_images, return_tensors="pt") + + self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values) + self.assertEqual(encoding_slow.image_grid_thw.dtype, encoding_fast.image_grid_thw.dtype) + self._assert_slow_fast_tensors_equivalence( + encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float() + ) diff --git a/tests/models/qwen2_vl/test_processor_qwen2_vl.py b/tests/models/qwen2_vl/test_processor_qwen2_vl.py index b8181ff5b3..eb5fdc79d0 100644 --- a/tests/models/qwen2_vl/test_processor_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.py @@ -20,18 +20,18 @@ import unittest import numpy as np import pytest -from transformers import AutoProcessor, Qwen2Tokenizer -from transformers.testing_utils import require_av, require_torch, require_vision +from transformers import AutoProcessor, Qwen2TokenizerFast +from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): - from transformers import Qwen2VLImageProcessor, Qwen2VLProcessor + from transformers import Qwen2VLProcessor if is_torchvision_available(): - from transformers import Qwen2VLVideoProcessor + from transformers import Qwen2VLImageProcessorFast, Qwen2VLVideoProcessor if is_torch_available(): import torch @@ -39,6 +39,7 @@ if is_torch_available(): @require_vision @require_torch +@require_torchvision class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Qwen2VLProcessor @@ -76,12 +77,12 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor ) processor.save_pretrained(self.tmpdirname) - processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=False) + processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=True) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer) - self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor) + self.assertIsInstance(processor.tokenizer, Qwen2TokenizerFast) + self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessorFast) self.assertIsInstance(processor.video_processor, Qwen2VLVideoProcessor) def test_image_processor(self): @@ -95,8 +96,8 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): image_input = self.prepare_image_inputs() - input_image_proc = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, text="dummy", return_tensors="np") + input_image_proc = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, text="dummy", return_tensors="pt") for key in input_image_proc.keys(): self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 2bb5b9c847..c9b9b09cbb 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -937,7 +937,7 @@ class ProcessorTesterMixin: "video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"] ) - @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")]) + @parameterized.expand([(1, "pt"), (2, "pt")]) # fast image processors supports only torchvision def test_apply_chat_template_image(self, batch_size: int, return_tensors: str): self._test_apply_chat_template( "image", batch_size, return_tensors, "images_input_name", "image_processor", MODALITY_INPUT_DATA["images"]