From 8b237b86398e108447427825703f7a80780785aa Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Mon, 28 Jul 2025 11:41:58 +0200 Subject: [PATCH] [processors] add tests for helper fn (#39629) * add tests for helpers * duplicate test for each model * why llava next video has no helper * oops must have been in the commit * fix test after rebase * add copy from --- .../models/aria/image_processing_aria.py | 4 +- src/transformers/models/aria/modular_aria.py | 4 +- .../models/colpali/processing_colpali.py | 5 +-- .../models/colqwen2/modular_colqwen2.py | 28 ++++++++++++- .../models/colqwen2/processing_colqwen2.py | 21 ++++++---- .../models/glm4v/image_processing_glm4v.py | 4 +- .../got_ocr2/image_processing_got_ocr2.py | 10 +++-- .../image_processing_got_ocr2_fast.py | 12 +++--- .../idefics3/image_processing_idefics3.py | 8 ++-- .../image_processing_idefics3_fast.py | 8 ++-- .../models/internvl/processing_internvl.py | 2 +- .../llava_next/processing_llava_next.py | 9 +---- .../processing_llava_next_video.py | 39 ++++++++++++++++++- .../models/paligemma/processing_paligemma.py | 5 +-- .../qwen2_vl/image_processing_qwen2_vl.py | 8 ++-- .../image_processing_qwen2_vl_fast.py | 8 ++-- .../smolvlm/image_processing_smolvlm.py | 8 ++-- .../smolvlm/image_processing_smolvlm_fast.py | 8 ++-- .../models/aria/test_image_processing_aria.py | 16 ++++++++ tests/models/aria/test_processor_aria.py | 13 +++++++ .../aya_vision/test_processor_aya_vision.py | 13 +++++++ .../chameleon/test_processor_chameleon.py | 13 +++++++ .../models/colpali/test_processing_colpali.py | 13 +++++++ .../colqwen2/test_processing_colqwen2.py | 13 +++++++ tests/models/emu3/test_processor_emu3.py | 13 +++++++ tests/models/fuyu/test_processor_fuyu.py | 13 +++++++ tests/models/gemma3/test_processing_gemma3.py | 13 +++++++ .../test_image_processing_got_ocr2.py | 21 ++++++++++ .../test_image_processing_idefics3.py | 25 ++++++++++++ .../idefics3/test_processor_idefics3.py | 13 +++++++ .../internvl/test_processor_internvl.py | 13 +++++++ tests/models/llava/test_processor_llava.py | 12 ++++++ .../llava_next/test_processor_llava_next.py | 13 +++++++ .../test_processor_llava_next_video.py | 13 +++++++ .../test_processor_llava_onevision.py | 13 +++++++ .../paligemma/test_processor_paligemma.py | 13 +++++++ .../qwen2_5_vl/test_processor_qwen2_5_vl.py | 13 +++++++ .../test_image_processing_qwen2_vl.py | 14 +++++++ .../qwen2_vl/test_processor_qwen2_vl.py | 13 +++++++ .../smolvlm/test_image_processing_smolvlm.py | 25 ++++++++++++ 40 files changed, 454 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/aria/image_processing_aria.py b/src/transformers/models/aria/image_processing_aria.py index d2b6c21a7f..4d0ae92dd0 100644 --- a/src/transformers/models/aria/image_processing_aria.py +++ b/src/transformers/models/aria/image_processing_aria.py @@ -515,8 +515,8 @@ class AriaImageProcessor(BaseImageProcessor): Returns: `int`: Number of patches per image. """ - split_image = images_kwargs.get("split_image", None) or self.split_image - max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size + split_image = images_kwargs["split_image"] if "split_image" in images_kwargs else self.split_image + max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size resized_height, resized_width = select_best_resolution((height, width), self.split_resolutions) num_patches = 1 if not split_image else resized_height // max_image_size * resized_width // max_image_size diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index d980898460..a531bc43b3 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -901,8 +901,8 @@ class AriaImageProcessor(BaseImageProcessor): Returns: `int`: Number of patches per image. """ - split_image = images_kwargs.get("split_image", None) or self.split_image - max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size + split_image = images_kwargs["split_image"] if "split_image" in images_kwargs else self.split_image + max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size resized_height, resized_width = select_best_resolution((height, width), self.split_resolutions) num_patches = 1 if not split_image else resized_height // max_image_size * resized_width // max_image_size diff --git a/src/transformers/models/colpali/processing_colpali.py b/src/transformers/models/colpali/processing_colpali.py index 759209beac..e0d8118d44 100644 --- a/src/transformers/models/colpali/processing_colpali.py +++ b/src/transformers/models/colpali/processing_colpali.py @@ -264,9 +264,8 @@ class ColPaliProcessor(ProcessorMixin): image_sizes (list[list[str]], *optional*): The input sizes formatted as (height, width) per each image. Returns: - dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio") - to a list containing the number of placeholder tokens required. If the model doesn't accept - a certain modality or no input sizes are provided, the dict value is set to an empty list. + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. """ vision_data = {} if image_sizes is not None: diff --git a/src/transformers/models/colqwen2/modular_colqwen2.py b/src/transformers/models/colqwen2/modular_colqwen2.py index 8e06d2ef32..5c7bfb2dc0 100644 --- a/src/transformers/models/colqwen2/modular_colqwen2.py +++ b/src/transformers/models/colqwen2/modular_colqwen2.py @@ -22,7 +22,7 @@ from transformers.models.colpali.processing_colpali import ColPaliProcessor from ...cache_utils import Cache from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image -from ...processing_utils import ProcessingKwargs, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available, logging from .configuration_colqwen2 import ColQwen2Config @@ -224,6 +224,32 @@ class ColQwen2Processor(ColPaliProcessor): return batch_query + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + + vision_data = {} + if image_sizes is not None: + images_kwargs = ColQwen2ProcessorKwargs._defaults.get("images_kwargs", {}) + images_kwargs.update(kwargs) + merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size + + num_image_patches = [ + self.image_processor.get_number_of_image_patches(*image_size, images_kwargs) + for image_size in image_sizes + ] + num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches] + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + class ColQwen2PreTrainedModel(ColPaliPreTrainedModel): pass diff --git a/src/transformers/models/colqwen2/processing_colqwen2.py b/src/transformers/models/colqwen2/processing_colqwen2.py index 59af4bdd42..a8b99380ac 100644 --- a/src/transformers/models/colqwen2/processing_colqwen2.py +++ b/src/transformers/models/colqwen2/processing_colqwen2.py @@ -226,20 +226,27 @@ class ColQwen2Processor(ProcessorMixin): def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): """ Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. - Args: - image_sizes (list[list[str]], *optional*): + image_sizes (`list[list[int]]`, *optional*): The input sizes formatted as (height, width) per each image. Returns: - dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio") - to a list containing the number of placeholder tokens required. If the model doesn't accept - a certain modality or no input sizes are provided, the dict value is set to an empty list. + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. """ + vision_data = {} if image_sizes is not None: - num_image_tokens = [self.image_seq_length] * len(image_sizes) - num_image_patches = [1] * len(image_sizes) + images_kwargs = ColQwen2ProcessorKwargs._defaults.get("images_kwargs", {}) + images_kwargs.update(kwargs) + merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size + + num_image_patches = [ + self.image_processor.get_number_of_image_patches(*image_size, images_kwargs) + for image_size in image_sizes + ] + num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches] vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + return MultiModalData(**vision_data) def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/glm4v/image_processing_glm4v.py b/src/transformers/models/glm4v/image_processing_glm4v.py index d991a09548..2b4f9aa24b 100644 --- a/src/transformers/models/glm4v/image_processing_glm4v.py +++ b/src/transformers/models/glm4v/image_processing_glm4v.py @@ -449,8 +449,8 @@ class Glm4vImageProcessor(BaseImageProcessor): Returns: `int`: Number of image patches per image. """ - patch_size = images_kwargs.get("patch_size", None) or self.patch_size - merge_size = images_kwargs.get("merge_size", None) or self.merge_size + patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.patch_size + merge_size = images_kwargs["merge_size"] if "merge_size" in images_kwargs else self.merge_size factor = patch_size * merge_size resized_height, resized_width = smart_resize( diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py index f2698310ed..a1f0eca4cc 100644 --- a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py @@ -505,10 +505,12 @@ class GotOcr2ImageProcessor(BaseImageProcessor): Returns: `int`: Number of patches per image. """ - min_patches = images_kwargs.get("min_patches", None) or self.min_patches - max_patches = images_kwargs.get("max_patches", None) or self.max_patches - patch_size = images_kwargs.get("size", None) or self.size - crop_to_patches = images_kwargs.get("crop_to_patches", None) or self.crop_to_patches + min_patches = images_kwargs["min_patches"] if "min_patches" in images_kwargs else self.min_patches + max_patches = images_kwargs["max_patches"] if "max_patches" in images_kwargs else self.max_patches + patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.size + crop_to_patches = ( + images_kwargs["crop_to_patches"] if "crop_to_patches" in images_kwargs else self.crop_to_patches + ) num_patches = 1 if crop_to_patches and max_patches > 1: diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py index 452b4c3f58..04cf09fe39 100644 --- a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py @@ -223,7 +223,7 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast): data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors ) - def get_number_of_image_tokens(self, height: int, width: int, images_kwargs=None): + def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None): """ A utility that returns number patches for a given image size. @@ -237,10 +237,12 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast): Returns: `int`: Number of patches per image. """ - min_patches = images_kwargs.get("min_patches", None) or self.min_patches - max_patches = images_kwargs.get("max_patches", None) or self.max_patches - patch_size = images_kwargs.get("size", None) or self.size - crop_to_patches = images_kwargs.get("crop_to_patches", None) or self.crop_to_patches + min_patches = images_kwargs["min_patches"] if "min_patches" in images_kwargs else self.min_patches + max_patches = images_kwargs["max_patches"] if "max_patches" in images_kwargs else self.max_patches + patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.size + crop_to_patches = ( + images_kwargs["crop_to_patches"] if "crop_to_patches" in images_kwargs else self.crop_to_patches + ) num_patches = 1 if crop_to_patches and max_patches > 1: diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py index 65ad87dae4..194dd092bb 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3.py +++ b/src/transformers/models/idefics3/image_processing_idefics3.py @@ -866,9 +866,11 @@ class Idefics3ImageProcessor(BaseImageProcessor): Returns: `int`: Number of patches per image. """ - do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting - max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size - size = images_kwargs.get("size", None) or self.size + do_image_splitting = ( + images_kwargs["do_image_splitting"] if "do_image_splitting" in images_kwargs else self.do_image_splitting + ) + max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size + size = images_kwargs["size"] if "size" in images_kwargs else self.size num_patches = num_rows = num_cols = 1 if do_image_splitting: diff --git a/src/transformers/models/idefics3/image_processing_idefics3_fast.py b/src/transformers/models/idefics3/image_processing_idefics3_fast.py index 8fdef6e378..a2251e7853 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3_fast.py +++ b/src/transformers/models/idefics3/image_processing_idefics3_fast.py @@ -514,9 +514,11 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast): Returns: `int`: Number of patches per image. """ - do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting - max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size - size = images_kwargs.get("size", None) or self.size + do_image_splitting = ( + images_kwargs["do_image_splitting"] if "do_image_splitting" in images_kwargs else self.do_image_splitting + ) + max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size + size = images_kwargs["size"] if "size" in images_kwargs else self.size num_patches = num_rows = num_cols = 1 if do_image_splitting: diff --git a/src/transformers/models/internvl/processing_internvl.py b/src/transformers/models/internvl/processing_internvl.py index 0193af7080..61f6f04482 100644 --- a/src/transformers/models/internvl/processing_internvl.py +++ b/src/transformers/models/internvl/processing_internvl.py @@ -284,7 +284,7 @@ class InternVLProcessor(ProcessorMixin): images_kwargs.update(kwargs) num_image_patches = [ - self.image_processor.get_number_of_image_tokens(*image_size, images_kwargs) + self.image_processor.get_number_of_image_patches(*image_size, images_kwargs) for image_size in image_sizes ] # Add 2 for BOI and EOI tokens diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index c5760aa169..92d8d56618 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -231,14 +231,9 @@ class LlavaNextProcessor(ProcessorMixin): Args: image_sizes (list[list[str]], *optional*): The input sizes formatted as (height, width) per each image. - video_sizes (list[list[str]], *optional*): - The input sizes formatted as (num_frames, height, width) per each video. - audio_lengths (list[int], *optional*): - The input length formatted as per each audio. Returns: - dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio") - to a list containing the number of placeholder tokens required. If the model doesn't accept - a certain modality or no input sizes are provided, the dict value is set to an empty list. + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. """ vision_data = {} if image_sizes is not None: diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index b50aee5af2..e04c968c19 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -23,7 +23,7 @@ import numpy as np from ...feature_extraction_utils import BatchFeature from ...image_processing_utils import select_best_resolution from ...image_utils import ImageInput, get_image_size, to_numpy_array -from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging from ...video_utils import VideoInput @@ -265,6 +265,43 @@ class LlavaNextVideoProcessor(ProcessorMixin): newline_features = current_height return (unpadded_features, newline_features) + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + Args: + image_sizes (list[list[str]], *optional*): + The input sizes formatted as (height, width) per each image. + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + vision_data = {} + if image_sizes is not None: + images_kwargs = LlavaNextVideoProcessorKwargs._defaults.get("images_kwargs", {}) + images_kwargs.update(kwargs) + + size = images_kwargs.get("size", None) or self.image_processor.size + size = ( + (size["shortest_edge"], size["shortest_edge"]) + if "shortest_edge" in size + else (min(size["height"], size["width"]), min(size["height"], size["width"])) + ) + processed_height, processed_width = size + + batch_num_image_tokens = [] + num_image_patches = [1] * len(image_sizes) # llava-next doesn't batch pixels as Idefics, thus `1` patch` + for image_size in image_sizes: + orig_height, orig_width = image_size + num_image_tokens = self._get_number_of_features( + orig_height, orig_width, processed_height, processed_width + ) + if self.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + batch_num_image_tokens.append(num_image_tokens) + vision_data.update({"num_image_tokens": batch_num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index b4c8e555b5..e9629b2d2f 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -327,9 +327,8 @@ class PaliGemmaProcessor(ProcessorMixin): image_sizes (list[list[str]], *optional*): The input sizes formatted as (height, width) per each image. Returns: - dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio") - to a list containing the number of placeholder tokens required. If the model doesn't accept - a certain modality or no input sizes are provided, the dict value is set to an empty list. + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. """ vision_data = {} if image_sizes is not None: diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py index 95b303346c..e42eeeef20 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py @@ -502,10 +502,10 @@ class Qwen2VLImageProcessor(BaseImageProcessor): Returns: `int`: Number of image patches per image. """ - min_pixels = images_kwargs.get("min_pixels", None) or self.size["shortest_edge"] - max_pixels = images_kwargs.get("max_pixels", None) or self.size["longest_edge"] - patch_size = images_kwargs.get("patch_size", None) or self.patch_size - merge_size = images_kwargs.get("merge_size", None) or self.merge_size + min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"] + max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"] + patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.patch_size + merge_size = images_kwargs["merge_size"] if "merge_size" in images_kwargs else self.merge_size factor = patch_size * merge_size resized_height, resized_width = smart_resize( diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py index 27628e2f74..cadecfbf3f 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py @@ -299,10 +299,10 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast): Returns: `int`: Number of image patches per image. """ - min_pixels = images_kwargs.get("min_pixels", None) or self.size["shortest_edge"] - max_pixels = images_kwargs.get("max_pixels", None) or self.size["longest_edge"] - patch_size = images_kwargs.get("patch_size", None) or self.patch_size - merge_size = images_kwargs.get("merge_size", None) or self.merge_size + min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"] + max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"] + patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.patch_size + merge_size = images_kwargs["merge_size"] if "merge_size" in images_kwargs else self.merge_size factor = patch_size * merge_size resized_height, resized_width = smart_resize( diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm.py b/src/transformers/models/smolvlm/image_processing_smolvlm.py index 431a6f32bb..440f263d0a 100644 --- a/src/transformers/models/smolvlm/image_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_smolvlm.py @@ -863,9 +863,11 @@ class SmolVLMImageProcessor(BaseImageProcessor): Returns: `int`: Number of patches per image. """ - do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting - max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size - size = images_kwargs.get("size", None) or self.size + do_image_splitting = ( + images_kwargs["do_image_splitting"] if "do_image_splitting" in images_kwargs else self.do_image_splitting + ) + max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size + size = images_kwargs["size"] if "size" in images_kwargs else self.size num_patches = num_rows = num_cols = 1 if do_image_splitting: diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py index 1cfca31306..ecbd3a7e07 100644 --- a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py +++ b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py @@ -504,9 +504,11 @@ class SmolVLMImageProcessorFast(BaseImageProcessorFast): Returns: `int`: Number of patches per image. """ - do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting - max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size - size = images_kwargs.get("size", None) or self.size + do_image_splitting = ( + images_kwargs["do_image_splitting"] if "do_image_splitting" in images_kwargs else self.do_image_splitting + ) + max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size + size = images_kwargs["size"] if "size" in images_kwargs else self.size num_patches = num_rows = num_cols = 1 if do_image_splitting: diff --git a/tests/models/aria/test_image_processing_aria.py b/tests/models/aria/test_image_processing_aria.py index f366c6b028..7974a27129 100644 --- a/tests/models/aria/test_image_processing_aria.py +++ b/tests/models/aria/test_image_processing_aria.py @@ -302,3 +302,19 @@ class AriaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:] ) self.assertEqual(encoded_image_shape, image_shape) + + def test_get_num_patches_without_images(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + num_patches = image_processing.get_number_of_image_patches(height=100, width=100, images_kwargs={}) + self.assertEqual(num_patches, 1) + + num_patches = image_processing.get_number_of_image_patches( + height=300, width=500, images_kwargs={"split_image": True} + ) + self.assertEqual(num_patches, 1) + + num_patches = image_processing.get_number_of_image_patches( + height=100, width=100, images_kwargs={"split_image": True, "max_image_size": 200} + ) + self.assertEqual(num_patches, 19) diff --git a/tests/models/aria/test_processor_aria.py b/tests/models/aria/test_processor_aria.py index 9df833661a..4c228d3c16 100644 --- a/tests/models/aria/test_processor_aria.py +++ b/tests/models/aria/test_processor_aria.py @@ -95,6 +95,19 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + def test_process_interleaved_images_prompts_image_splitting(self): processor = self.get_processor() processor.image_processor.split_image = True diff --git a/tests/models/aya_vision/test_processor_aya_vision.py b/tests/models/aya_vision/test_processor_aya_vision.py index 4e17bea44f..b768f08a03 100644 --- a/tests/models/aya_vision/test_processor_aya_vision.py +++ b/tests/models/aya_vision/test_processor_aya_vision.py @@ -80,6 +80,19 @@ class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + @require_torch def test_process_interleaved_images_videos(self): processor = self.get_processor() diff --git a/tests/models/chameleon/test_processor_chameleon.py b/tests/models/chameleon/test_processor_chameleon.py index d11321c9a8..57f3b810af 100644 --- a/tests/models/chameleon/test_processor_chameleon.py +++ b/tests/models/chameleon/test_processor_chameleon.py @@ -74,3 +74,16 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase): @staticmethod def prepare_processor_dict(): return {"image_seq_length": 2} # fmt: skip + + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) diff --git a/tests/models/colpali/test_processing_colpali.py b/tests/models/colpali/test_processing_colpali.py index 7a51786158..539b604a35 100644 --- a/tests/models/colpali/test_processing_colpali.py +++ b/tests/models/colpali/test_processing_colpali.py @@ -54,6 +54,19 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + @require_torch @require_vision def test_process_images(self): diff --git a/tests/models/colqwen2/test_processing_colqwen2.py b/tests/models/colqwen2/test_processing_colqwen2.py index 0da0ce86b4..25e6b523c8 100644 --- a/tests/models/colqwen2/test_processing_colqwen2.py +++ b/tests/models/colqwen2/test_processing_colqwen2.py @@ -57,6 +57,19 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname) + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + def test_process_images(self): # Processor configuration image_input = self.prepare_image_inputs() diff --git a/tests/models/emu3/test_processor_emu3.py b/tests/models/emu3/test_processor_emu3.py index c595a91ee9..bb7c8187e5 100644 --- a/tests/models/emu3/test_processor_emu3.py +++ b/tests/models/emu3/test_processor_emu3.py @@ -90,3 +90,16 @@ class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): # For an image where pixels go from 0 to 255 the diff can be 1 due to some numerical precision errors when scaling and unscaling self.assertTrue(np.abs(orig_image - unnormalized_images).max() >= 1) + + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) diff --git a/tests/models/fuyu/test_processor_fuyu.py b/tests/models/fuyu/test_processor_fuyu.py index 1f2c754bd5..6fb935cbec 100644 --- a/tests/models/fuyu/test_processor_fuyu.py +++ b/tests/models/fuyu/test_processor_fuyu.py @@ -64,6 +64,19 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase): def get_image_processor(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + def test_fuyu_processing(self): """ Test to ensure that the standard processing on a gold example matches adept's code. diff --git a/tests/models/gemma3/test_processing_gemma3.py b/tests/models/gemma3/test_processing_gemma3.py index 30587a8f55..98984a3c08 100644 --- a/tests/models/gemma3/test_processing_gemma3.py +++ b/tests/models/gemma3/test_processing_gemma3.py @@ -58,6 +58,19 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor.save_pretrained(cls.tmpdirname) cls.image_token = processor.boi_token + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) diff --git a/tests/models/got_ocr2/test_image_processing_got_ocr2.py b/tests/models/got_ocr2/test_image_processing_got_ocr2.py index 53b44eba61..ccfcf2f062 100644 --- a/tests/models/got_ocr2/test_image_processing_got_ocr2.py +++ b/tests/models/got_ocr2/test_image_processing_got_ocr2.py @@ -169,3 +169,24 @@ class GotOcr2ProcessingTest(ImageProcessingTestMixin, unittest.TestCase): ) self.assertEqual(len(processed_images[0]), 5) self.assertEqual(processed_images.shape[-2:], (20, 20)) + + def test_get_num_patches_without_images(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + num_patches = image_processing.get_number_of_image_patches(height=100, width=100, images_kwargs={}) + self.assertEqual(num_patches, 1) + + num_patches = image_processing.get_number_of_image_patches( + height=300, width=500, images_kwargs={"crop_to_patches": False} + ) + self.assertEqual(num_patches, 1) + + num_patches = image_processing.get_number_of_image_patches( + height=100, width=100, images_kwargs={"crop_to_patches": True} + ) + self.assertEqual(num_patches, 10) + + num_patches = image_processing.get_number_of_image_patches( + height=100, width=100, images_kwargs={"crop_to_patches": True, "max_patches": 200} + ) + self.assertEqual(num_patches, 50) diff --git a/tests/models/idefics3/test_image_processing_idefics3.py b/tests/models/idefics3/test_image_processing_idefics3.py index 7a1eb4f44f..f855de2824 100644 --- a/tests/models/idefics3/test_image_processing_idefics3.py +++ b/tests/models/idefics3/test_image_processing_idefics3.py @@ -358,3 +358,28 @@ class Idefics3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): ) self.assertEqual(encoding_slow.rows, encoding_fast.rows) self.assertEqual(encoding_slow.cols, encoding_fast.cols) + + def test_get_num_patches_without_images(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + num_patches_and_row_cols = image_processing.get_number_of_image_patches( + height=100, width=100, images_kwargs={} + ) + self.assertEqual(num_patches_and_row_cols, (5, 2, 2)) + + num_patches_and_row_cols = image_processing.get_number_of_image_patches( + height=300, width=500, images_kwargs={"do_image_splitting": False} + ) + self.assertEqual(num_patches_and_row_cols, (1, 1, 1)) + + num_patches_and_row_cols = image_processing.get_number_of_image_patches( + height=300, width=500, images_kwargs={"do_image_splitting": True} + ) + self.assertEqual(num_patches_and_row_cols, (5, 2, 2)) + + num_patches_and_row_cols = image_processing.get_number_of_image_patches( + height=300, + width=600, + images_kwargs={"do_image_splitting": True, "max_image_size": {"longest_edge": 30}}, + ) + self.assertEqual(num_patches_and_row_cols, (3, 1, 2)) diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py index 99b931a12c..7020a24398 100644 --- a/tests/models/idefics3/test_processor_idefics3.py +++ b/tests/models/idefics3/test_processor_idefics3.py @@ -84,6 +84,19 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def prepare_processor_dict(): return {"image_seq_len": 2} + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + def get_split_image_expected_tokens(self, processor, image_rows, image_cols): text_split_images = [] for n_h in range(image_rows): diff --git a/tests/models/internvl/test_processor_internvl.py b/tests/models/internvl/test_processor_internvl.py index 614c5b4866..f3340e8af0 100644 --- a/tests/models/internvl/test_processor_internvl.py +++ b/tests/models/internvl/test_processor_internvl.py @@ -97,6 +97,19 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + @require_av @require_torch def test_process_interleaved_images_videos(self): diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py index d89601d78b..41b9d8a09e 100644 --- a/tests/models/llava/test_processor_llava.py +++ b/tests/models/llava/test_processor_llava.py @@ -61,6 +61,18 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): "vision_feature_select_strategy": "default" } # fmt: skip + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + def test_chat_template_is_saved(self): processor_loaded = self.processor_class.from_pretrained(self.tmpdirname) processor_dict_loaded = json.loads(processor_loaded.to_json_string()) diff --git a/tests/models/llava_next/test_processor_llava_next.py b/tests/models/llava_next/test_processor_llava_next.py index 2adf527d78..d6156adb75 100644 --- a/tests/models/llava_next/test_processor_llava_next.py +++ b/tests/models/llava_next/test_processor_llava_next.py @@ -66,6 +66,19 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase): "vision_feature_select_strategy": "default" } # fmt: skip + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved def test_chat_template_is_saved(self): processor_loaded = self.processor_class.from_pretrained(self.tmpdirname) diff --git a/tests/models/llava_next_video/test_processor_llava_next_video.py b/tests/models/llava_next_video/test_processor_llava_next_video.py index b902b8c496..17bcb3657d 100644 --- a/tests/models/llava_next_video/test_processor_llava_next_video.py +++ b/tests/models/llava_next_video/test_processor_llava_next_video.py @@ -75,6 +75,19 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): "vision_feature_select_strategy": "default", } + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved def test_chat_template_is_saved(self): processor_loaded = self.processor_class.from_pretrained(self.tmpdirname) diff --git a/tests/models/llava_onevision/test_processor_llava_onevision.py b/tests/models/llava_onevision/test_processor_llava_onevision.py index 52f2b99f92..1eb3b0d0d4 100644 --- a/tests/models/llava_onevision/test_processor_llava_onevision.py +++ b/tests/models/llava_onevision/test_processor_llava_onevision.py @@ -79,6 +79,19 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase): "vision_feature_select_strategy": "default" } # fmt: skip + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved def test_chat_template_is_saved(self): processor_loaded = self.processor_class.from_pretrained(self.tmpdirname) diff --git a/tests/models/paligemma/test_processor_paligemma.py b/tests/models/paligemma/test_processor_paligemma.py index 56e7492892..821e18d550 100644 --- a/tests/models/paligemma/test_processor_paligemma.py +++ b/tests/models/paligemma/test_processor_paligemma.py @@ -48,6 +48,19 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + @require_torch @require_vision def test_image_seq_length(self): diff --git a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py index b8aa49b004..c3f478950f 100644 --- a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py @@ -65,6 +65,19 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + def test_save_load_pretrained_default(self): tokenizer = self.get_tokenizer() image_processor = self.get_image_processor() diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py index 6ff2fa70c0..d17cd690cf 100644 --- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py @@ -394,3 +394,17 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): self._assert_slow_fast_tensors_equivalence( encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float() ) + + def test_get_num_patches_without_images(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + num_patches = image_processing.get_number_of_image_patches(height=100, width=100, images_kwargs={}) + self.assertEqual(num_patches, 64) + + num_patches = image_processing.get_number_of_image_patches(height=200, width=50, images_kwargs={}) + self.assertEqual(num_patches, 56) + + num_patches = image_processing.get_number_of_image_patches( + height=100, width=100, images_kwargs={"patch_size": 28} + ) + self.assertEqual(num_patches, 16) diff --git a/tests/models/qwen2_vl/test_processor_qwen2_vl.py b/tests/models/qwen2_vl/test_processor_qwen2_vl.py index eb5fdc79d0..69fae59595 100644 --- a/tests/models/qwen2_vl/test_processor_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.py @@ -68,6 +68,19 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens + def test_get_num_vision_tokens(self): + "Tests general functionality of the helper used internally in vLLM" + + processor = self.get_processor() + + output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)]) + self.assertTrue("num_image_tokens" in output) + self.assertEqual(len(output["num_image_tokens"]), 3) + + self.assertTrue("num_image_patches" in output) + self.assertEqual(len(output["num_image_patches"]), 3) + def test_save_load_pretrained_default(self): tokenizer = self.get_tokenizer() image_processor = self.get_image_processor() diff --git a/tests/models/smolvlm/test_image_processing_smolvlm.py b/tests/models/smolvlm/test_image_processing_smolvlm.py index 34b893f817..be687e7952 100644 --- a/tests/models/smolvlm/test_image_processing_smolvlm.py +++ b/tests/models/smolvlm/test_image_processing_smolvlm.py @@ -358,3 +358,28 @@ class SmolVLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): ) self.assertEqual(encoding_slow.rows, encoding_fast.rows) self.assertEqual(encoding_slow.cols, encoding_fast.cols) + + def test_get_num_patches_without_images(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + num_patches_and_row_cols = image_processing.get_number_of_image_patches( + height=100, width=100, images_kwargs={} + ) + self.assertEqual(num_patches_and_row_cols, (5, 2, 2)) + + num_patches_and_row_cols = image_processing.get_number_of_image_patches( + height=300, width=500, images_kwargs={"do_image_splitting": False} + ) + self.assertEqual(num_patches_and_row_cols, (1, 1, 1)) + + num_patches_and_row_cols = image_processing.get_number_of_image_patches( + height=300, width=500, images_kwargs={"do_image_splitting": True} + ) + self.assertEqual(num_patches_and_row_cols, (5, 2, 2)) + + num_patches_and_row_cols = image_processing.get_number_of_image_patches( + height=300, + width=600, + images_kwargs={"do_image_splitting": True, "max_image_size": {"longest_edge": 30}}, + ) + self.assertEqual(num_patches_and_row_cols, (3, 1, 2))