From 8b237b86398e108447427825703f7a80780785aa Mon Sep 17 00:00:00 2001
From: Raushan Turganbay <raushan@huggingface.co>
Date: Mon, 28 Jul 2025 11:41:58 +0200
Subject: [PATCH] [processors] add tests for helper fn (#39629)

* add tests for helpers

* duplicate test for each model

* why llava next video has no helper

* oops must have been in the commit

* fix test after rebase

* add copy from
---
 .../models/aria/image_processing_aria.py      |  4 +-
 src/transformers/models/aria/modular_aria.py  |  4 +-
 .../models/colpali/processing_colpali.py      |  5 +--
 .../models/colqwen2/modular_colqwen2.py       | 28 ++++++++++++-
 .../models/colqwen2/processing_colqwen2.py    | 21 ++++++----
 .../models/glm4v/image_processing_glm4v.py    |  4 +-
 .../got_ocr2/image_processing_got_ocr2.py     | 10 +++--
 .../image_processing_got_ocr2_fast.py         | 12 +++---
 .../idefics3/image_processing_idefics3.py     |  8 ++--
 .../image_processing_idefics3_fast.py         |  8 ++--
 .../models/internvl/processing_internvl.py    |  2 +-
 .../llava_next/processing_llava_next.py       |  9 +----
 .../processing_llava_next_video.py            | 39 ++++++++++++++++++-
 .../models/paligemma/processing_paligemma.py  |  5 +--
 .../qwen2_vl/image_processing_qwen2_vl.py     |  8 ++--
 .../image_processing_qwen2_vl_fast.py         |  8 ++--
 .../smolvlm/image_processing_smolvlm.py       |  8 ++--
 .../smolvlm/image_processing_smolvlm_fast.py  |  8 ++--
 .../models/aria/test_image_processing_aria.py | 16 ++++++++
 tests/models/aria/test_processor_aria.py      | 13 +++++++
 .../aya_vision/test_processor_aya_vision.py   | 13 +++++++
 .../chameleon/test_processor_chameleon.py     | 13 +++++++
 .../models/colpali/test_processing_colpali.py | 13 +++++++
 .../colqwen2/test_processing_colqwen2.py      | 13 +++++++
 tests/models/emu3/test_processor_emu3.py      | 13 +++++++
 tests/models/fuyu/test_processor_fuyu.py      | 13 +++++++
 tests/models/gemma3/test_processing_gemma3.py | 13 +++++++
 .../test_image_processing_got_ocr2.py         | 21 ++++++++++
 .../test_image_processing_idefics3.py         | 25 ++++++++++++
 .../idefics3/test_processor_idefics3.py       | 13 +++++++
 .../internvl/test_processor_internvl.py       | 13 +++++++
 tests/models/llava/test_processor_llava.py    | 12 ++++++
 .../llava_next/test_processor_llava_next.py   | 13 +++++++
 .../test_processor_llava_next_video.py        | 13 +++++++
 .../test_processor_llava_onevision.py         | 13 +++++++
 .../paligemma/test_processor_paligemma.py     | 13 +++++++
 .../qwen2_5_vl/test_processor_qwen2_5_vl.py   | 13 +++++++
 .../test_image_processing_qwen2_vl.py         | 14 +++++++
 .../qwen2_vl/test_processor_qwen2_vl.py       | 13 +++++++
 .../smolvlm/test_image_processing_smolvlm.py  | 25 ++++++++++++
 40 files changed, 454 insertions(+), 58 deletions(-)

diff --git a/src/transformers/models/aria/image_processing_aria.py b/src/transformers/models/aria/image_processing_aria.py
index d2b6c21a7f..4d0ae92dd0 100644
--- a/src/transformers/models/aria/image_processing_aria.py
+++ b/src/transformers/models/aria/image_processing_aria.py
@@ -515,8 +515,8 @@ class AriaImageProcessor(BaseImageProcessor):
         Returns:
             `int`: Number of patches per image.
         """
-        split_image = images_kwargs.get("split_image", None) or self.split_image
-        max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
+        split_image = images_kwargs["split_image"] if "split_image" in images_kwargs else self.split_image
+        max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size
 
         resized_height, resized_width = select_best_resolution((height, width), self.split_resolutions)
         num_patches = 1 if not split_image else resized_height // max_image_size * resized_width // max_image_size
diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py
index d980898460..a531bc43b3 100644
--- a/src/transformers/models/aria/modular_aria.py
+++ b/src/transformers/models/aria/modular_aria.py
@@ -901,8 +901,8 @@ class AriaImageProcessor(BaseImageProcessor):
         Returns:
             `int`: Number of patches per image.
         """
-        split_image = images_kwargs.get("split_image", None) or self.split_image
-        max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
+        split_image = images_kwargs["split_image"] if "split_image" in images_kwargs else self.split_image
+        max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size
 
         resized_height, resized_width = select_best_resolution((height, width), self.split_resolutions)
         num_patches = 1 if not split_image else resized_height // max_image_size * resized_width // max_image_size
diff --git a/src/transformers/models/colpali/processing_colpali.py b/src/transformers/models/colpali/processing_colpali.py
index 759209beac..e0d8118d44 100644
--- a/src/transformers/models/colpali/processing_colpali.py
+++ b/src/transformers/models/colpali/processing_colpali.py
@@ -264,9 +264,8 @@ class ColPaliProcessor(ProcessorMixin):
             image_sizes (list[list[str]], *optional*):
                 The input sizes formatted as (height, width) per each image.
         Returns:
-            dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio")
-            to a list containing the number of placeholder tokens required. If the model doesn't accept
-            a certain modality or no input sizes are provided, the dict value is set to an empty list.
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
         """
         vision_data = {}
         if image_sizes is not None:
diff --git a/src/transformers/models/colqwen2/modular_colqwen2.py b/src/transformers/models/colqwen2/modular_colqwen2.py
index 8e06d2ef32..5c7bfb2dc0 100644
--- a/src/transformers/models/colqwen2/modular_colqwen2.py
+++ b/src/transformers/models/colqwen2/modular_colqwen2.py
@@ -22,7 +22,7 @@ from transformers.models.colpali.processing_colpali import ColPaliProcessor
 from ...cache_utils import Cache
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, is_valid_image
-from ...processing_utils import ProcessingKwargs, Unpack
+from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available, logging
 from .configuration_colqwen2 import ColQwen2Config
@@ -224,6 +224,32 @@ class ColQwen2Processor(ColPaliProcessor):
 
             return batch_query
 
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            # Build a fresh dict so per-call kwargs never mutate the shared class-level `_defaults`.
+            images_kwargs = {**ColQwen2ProcessorKwargs._defaults.get("images_kwargs", {}), **kwargs}
+            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
+
+            num_image_patches = [
+                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
+                for image_size in image_sizes
+            ]
+            num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
 
 class ColQwen2PreTrainedModel(ColPaliPreTrainedModel):
     pass
diff --git a/src/transformers/models/colqwen2/processing_colqwen2.py b/src/transformers/models/colqwen2/processing_colqwen2.py
index 59af4bdd42..a8b99380ac 100644
--- a/src/transformers/models/colqwen2/processing_colqwen2.py
+++ b/src/transformers/models/colqwen2/processing_colqwen2.py
@@ -226,20 +226,27 @@ class ColQwen2Processor(ProcessorMixin):
     def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
         """
         Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
-
         Args:
-            image_sizes (list[list[str]], *optional*):
+            image_sizes (`list[list[int]]`, *optional*):
                 The input sizes formatted as (height, width) per each image.
         Returns:
-            dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio")
-            to a list containing the number of placeholder tokens required. If the model doesn't accept
-            a certain modality or no input sizes are provided, the dict value is set to an empty list.
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
         """
+
         vision_data = {}
         if image_sizes is not None:
-            num_image_tokens = [self.image_seq_length] * len(image_sizes)
-            num_image_patches = [1] * len(image_sizes)
+            # Build a fresh dict so per-call kwargs never mutate the shared class-level `_defaults`.
+            images_kwargs = {**ColQwen2ProcessorKwargs._defaults.get("images_kwargs", {}), **kwargs}
+            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
+
+            num_image_patches = [
+                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
+                for image_size in image_sizes
+            ]
+            num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
             vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
         return MultiModalData(**vision_data)
 
     def batch_decode(self, *args, **kwargs):
diff --git a/src/transformers/models/glm4v/image_processing_glm4v.py b/src/transformers/models/glm4v/image_processing_glm4v.py
index d991a09548..2b4f9aa24b 100644
--- a/src/transformers/models/glm4v/image_processing_glm4v.py
+++ b/src/transformers/models/glm4v/image_processing_glm4v.py
@@ -449,8 +449,8 @@ class Glm4vImageProcessor(BaseImageProcessor):
         Returns:
             `int`: Number of image patches per image.
         """
-        patch_size = images_kwargs.get("patch_size", None) or self.patch_size
-        merge_size = images_kwargs.get("merge_size", None) or self.merge_size
+        patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.patch_size
+        merge_size = images_kwargs["merge_size"] if "merge_size" in images_kwargs else self.merge_size
 
         factor = patch_size * merge_size
         resized_height, resized_width = smart_resize(
diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py
index f2698310ed..a1f0eca4cc 100644
--- a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py
+++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py
@@ -505,10 +505,12 @@ class GotOcr2ImageProcessor(BaseImageProcessor):
         Returns:
             `int`: Number of patches per image.
         """
-        min_patches = images_kwargs.get("min_patches", None) or self.min_patches
-        max_patches = images_kwargs.get("max_patches", None) or self.max_patches
-        patch_size = images_kwargs.get("size", None) or self.size
-        crop_to_patches = images_kwargs.get("crop_to_patches", None) or self.crop_to_patches
+        min_patches = images_kwargs["min_patches"] if "min_patches" in images_kwargs else self.min_patches
+        max_patches = images_kwargs["max_patches"] if "max_patches" in images_kwargs else self.max_patches
+        patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.size
+        crop_to_patches = (
+            images_kwargs["crop_to_patches"] if "crop_to_patches" in images_kwargs else self.crop_to_patches
+        )
 
         num_patches = 1
         if crop_to_patches and max_patches > 1:
diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py
index 452b4c3f58..04cf09fe39 100644
--- a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py
+++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py
@@ -223,7 +223,7 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
             data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors
         )
 
-    def get_number_of_image_tokens(self, height: int, width: int, images_kwargs=None):
+    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
         """
         A utility that returns number patches for a given image size.
 
@@ -237,10 +237,12 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
         Returns:
             `int`: Number of patches per image.
         """
-        min_patches = images_kwargs.get("min_patches", None) or self.min_patches
-        max_patches = images_kwargs.get("max_patches", None) or self.max_patches
-        patch_size = images_kwargs.get("size", None) or self.size
-        crop_to_patches = images_kwargs.get("crop_to_patches", None) or self.crop_to_patches
+        min_patches = images_kwargs["min_patches"] if "min_patches" in images_kwargs else self.min_patches
+        max_patches = images_kwargs["max_patches"] if "max_patches" in images_kwargs else self.max_patches
+        patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.size
+        crop_to_patches = (
+            images_kwargs["crop_to_patches"] if "crop_to_patches" in images_kwargs else self.crop_to_patches
+        )
 
         num_patches = 1
         if crop_to_patches and max_patches > 1:
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 65ad87dae4..194dd092bb 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -866,9 +866,11 @@ class Idefics3ImageProcessor(BaseImageProcessor):
         Returns:
             `int`: Number of patches per image.
         """
-        do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting
-        max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
-        size = images_kwargs.get("size", None) or self.size
+        do_image_splitting = (
+            images_kwargs["do_image_splitting"] if "do_image_splitting" in images_kwargs else self.do_image_splitting
+        )
+        max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size
+        size = images_kwargs["size"] if "size" in images_kwargs else self.size
 
         num_patches = num_rows = num_cols = 1
         if do_image_splitting:
diff --git a/src/transformers/models/idefics3/image_processing_idefics3_fast.py b/src/transformers/models/idefics3/image_processing_idefics3_fast.py
index 8fdef6e378..a2251e7853 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3_fast.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3_fast.py
@@ -514,9 +514,11 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast):
         Returns:
             `int`: Number of patches per image.
         """
-        do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting
-        max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
-        size = images_kwargs.get("size", None) or self.size
+        do_image_splitting = (
+            images_kwargs["do_image_splitting"] if "do_image_splitting" in images_kwargs else self.do_image_splitting
+        )
+        max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size
+        size = images_kwargs["size"] if "size" in images_kwargs else self.size
 
         num_patches = num_rows = num_cols = 1
         if do_image_splitting:
diff --git a/src/transformers/models/internvl/processing_internvl.py b/src/transformers/models/internvl/processing_internvl.py
index 0193af7080..61f6f04482 100644
--- a/src/transformers/models/internvl/processing_internvl.py
+++ b/src/transformers/models/internvl/processing_internvl.py
@@ -284,7 +284,7 @@ class InternVLProcessor(ProcessorMixin):
             images_kwargs.update(kwargs)
 
             num_image_patches = [
-                self.image_processor.get_number_of_image_tokens(*image_size, images_kwargs)
+                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
                 for image_size in image_sizes
             ]
             # Add 2 for BOI and EOI tokens
diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py
index c5760aa169..92d8d56618 100644
--- a/src/transformers/models/llava_next/processing_llava_next.py
+++ b/src/transformers/models/llava_next/processing_llava_next.py
@@ -231,14 +231,9 @@ class LlavaNextProcessor(ProcessorMixin):
         Args:
             image_sizes (list[list[str]], *optional*):
                 The input sizes formatted as (height, width) per each image.
-            video_sizes (list[list[str]], *optional*):
-                The input sizes formatted as (num_frames, height, width) per each video.
-            audio_lengths (list[int], *optional*):
-                The input length formatted as per each audio.
         Returns:
-            dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio")
-            to a list containing the number of placeholder tokens required. If the model doesn't accept
-            a certain modality or no input sizes are provided, the dict value is set to an empty list.
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
         """
         vision_data = {}
         if image_sizes is not None:
diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py
index b50aee5af2..e04c968c19 100644
--- a/src/transformers/models/llava_next_video/processing_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -23,7 +23,7 @@ import numpy as np
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils import select_best_resolution
 from ...image_utils import ImageInput, get_image_size, to_numpy_array
-from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import logging
 from ...video_utils import VideoInput
@@ -265,6 +265,43 @@ class LlavaNextVideoProcessor(ProcessorMixin):
         newline_features = current_height
         return (unpadded_features, newline_features)
 
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+        vision_data = {}
+        if image_sizes is not None:
+            # Build a fresh dict so per-call kwargs never mutate the shared class-level `_defaults`.
+            images_kwargs = {**LlavaNextVideoProcessorKwargs._defaults.get("images_kwargs", {}), **kwargs}
+
+            size = images_kwargs.get("size", None) or self.image_processor.size
+            size = (
+                (size["shortest_edge"], size["shortest_edge"])
+                if "shortest_edge" in size
+                else (min(size["height"], size["width"]), min(size["height"], size["width"]))
+            )
+            processed_height, processed_width = size
+
+            batch_num_image_tokens = []
+            num_image_patches = [1] * len(image_sizes)  # llava-next doesn't batch pixels as Idefics, thus `1` patch
+            for image_size in image_sizes:
+                orig_height, orig_width = image_size
+                num_image_tokens = self._get_number_of_features(
+                    orig_height, orig_width, processed_height, processed_width
+                )
+                if self.vision_feature_select_strategy == "default":
+                    num_image_tokens -= 1
+                batch_num_image_tokens.append(num_image_tokens)
+            vision_data.update({"num_image_tokens": batch_num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
     def batch_decode(self, *args, **kwargs):
         """
diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py
index b4c8e555b5..e9629b2d2f 100644
--- a/src/transformers/models/paligemma/processing_paligemma.py
+++ b/src/transformers/models/paligemma/processing_paligemma.py
@@ -327,9 +327,8 @@ class PaliGemmaProcessor(ProcessorMixin):
             image_sizes (list[list[str]], *optional*):
                 The input sizes formatted as (height, width) per each image.
         Returns:
-            dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio")
-            to a list containing the number of placeholder tokens required. If the model doesn't accept
-            a certain modality or no input sizes are provided, the dict value is set to an empty list.
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
         """
         vision_data = {}
         if image_sizes is not None:
diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
index 95b303346c..e42eeeef20 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
@@ -502,10 +502,10 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
         Returns:
             `int`: Number of image patches per image.
         """
-        min_pixels = images_kwargs.get("min_pixels", None) or self.size["shortest_edge"]
-        max_pixels = images_kwargs.get("max_pixels", None) or self.size["longest_edge"]
-        patch_size = images_kwargs.get("patch_size", None) or self.patch_size
-        merge_size = images_kwargs.get("merge_size", None) or self.merge_size
+        min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"]
+        max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"]
+        patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.patch_size
+        merge_size = images_kwargs["merge_size"] if "merge_size" in images_kwargs else self.merge_size
 
         factor = patch_size * merge_size
         resized_height, resized_width = smart_resize(
diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
index 27628e2f74..cadecfbf3f 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
@@ -299,10 +299,10 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
         Returns:
             `int`: Number of image patches per image.
         """
-        min_pixels = images_kwargs.get("min_pixels", None) or self.size["shortest_edge"]
-        max_pixels = images_kwargs.get("max_pixels", None) or self.size["longest_edge"]
-        patch_size = images_kwargs.get("patch_size", None) or self.patch_size
-        merge_size = images_kwargs.get("merge_size", None) or self.merge_size
+        min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"]
+        max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"]
+        patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.patch_size
+        merge_size = images_kwargs["merge_size"] if "merge_size" in images_kwargs else self.merge_size
 
         factor = patch_size * merge_size
         resized_height, resized_width = smart_resize(
diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm.py b/src/transformers/models/smolvlm/image_processing_smolvlm.py
index 431a6f32bb..440f263d0a 100644
--- a/src/transformers/models/smolvlm/image_processing_smolvlm.py
+++ b/src/transformers/models/smolvlm/image_processing_smolvlm.py
@@ -863,9 +863,11 @@ class SmolVLMImageProcessor(BaseImageProcessor):
         Returns:
             `int`: Number of patches per image.
         """
-        do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting
-        max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
-        size = images_kwargs.get("size", None) or self.size
+        do_image_splitting = (
+            images_kwargs["do_image_splitting"] if "do_image_splitting" in images_kwargs else self.do_image_splitting
+        )
+        max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size
+        size = images_kwargs["size"] if "size" in images_kwargs else self.size
 
         num_patches = num_rows = num_cols = 1
         if do_image_splitting:
diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py
index 1cfca31306..ecbd3a7e07 100644
--- a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py
+++ b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py
@@ -504,9 +504,11 @@ class SmolVLMImageProcessorFast(BaseImageProcessorFast):
         Returns:
             `int`: Number of patches per image.
         """
-        do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting
-        max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
-        size = images_kwargs.get("size", None) or self.size
+        do_image_splitting = (
+            images_kwargs["do_image_splitting"] if "do_image_splitting" in images_kwargs else self.do_image_splitting
+        )
+        max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size
+        size = images_kwargs["size"] if "size" in images_kwargs else self.size
 
         num_patches = num_rows = num_cols = 1
         if do_image_splitting:
diff --git a/tests/models/aria/test_image_processing_aria.py b/tests/models/aria/test_image_processing_aria.py
index f366c6b028..7974a27129 100644
--- a/tests/models/aria/test_image_processing_aria.py
+++ b/tests/models/aria/test_image_processing_aria.py
@@ -302,3 +302,19 @@ class AriaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
                 encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:]
             )
             self.assertEqual(encoded_image_shape, image_shape)
+
+    def test_get_num_patches_without_images(self):
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
+            num_patches = image_processing.get_number_of_image_patches(height=100, width=100, images_kwargs={})
+            self.assertEqual(num_patches, 1)
+
+            num_patches = image_processing.get_number_of_image_patches(
+                height=300, width=500, images_kwargs={"split_image": True}
+            )
+            self.assertEqual(num_patches, 1)
+
+            num_patches = image_processing.get_number_of_image_patches(
+                height=100, width=100, images_kwargs={"split_image": True, "max_image_size": 200}
+            )
+            self.assertEqual(num_patches, 19)
diff --git a/tests/models/aria/test_processor_aria.py b/tests/models/aria/test_processor_aria.py
index 9df833661a..4c228d3c16 100644
--- a/tests/models/aria/test_processor_aria.py
+++ b/tests/models/aria/test_processor_aria.py
@@ -95,6 +95,19 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     def tearDownClass(cls):
         shutil.rmtree(cls.tmpdirname, ignore_errors=True)
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        "Tests general functionality of the helper used internally in vLLM"
+
+        processor = self.get_processor()
+
+        output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
+        self.assertTrue("num_image_tokens" in output)
+        self.assertEqual(len(output["num_image_tokens"]), 3)
+
+        self.assertTrue("num_image_patches" in output)
+        self.assertEqual(len(output["num_image_patches"]), 3)
+
     def test_process_interleaved_images_prompts_image_splitting(self):
         processor = self.get_processor()
         processor.image_processor.split_image = True
diff --git a/tests/models/aya_vision/test_processor_aya_vision.py b/tests/models/aya_vision/test_processor_aya_vision.py
index 4e17bea44f..b768f08a03 100644
--- a/tests/models/aya_vision/test_processor_aya_vision.py
+++ b/tests/models/aya_vision/test_processor_aya_vision.py
@@ -80,6 +80,19 @@ class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     def tearDownClass(cls):
         shutil.rmtree(cls.tmpdirname, ignore_errors=True)
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        "Tests general functionality of the helper used internally in vLLM"
+
+        processor = self.get_processor()
+
+        output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
+        self.assertTrue("num_image_tokens" in output)
+        self.assertEqual(len(output["num_image_tokens"]), 3)
+
+        self.assertTrue("num_image_patches" in output)
+        self.assertEqual(len(output["num_image_patches"]), 3)
+
     @require_torch
     def test_process_interleaved_images_videos(self):
         processor = self.get_processor()
diff --git a/tests/models/chameleon/test_processor_chameleon.py b/tests/models/chameleon/test_processor_chameleon.py
index d11321c9a8..57f3b810af 100644
--- a/tests/models/chameleon/test_processor_chameleon.py
+++ b/tests/models/chameleon/test_processor_chameleon.py
@@ -74,3 +74,16 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     @staticmethod
     def prepare_processor_dict():
         return {"image_seq_length": 2}  # fmt: skip
+
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
diff --git a/tests/models/colpali/test_processing_colpali.py b/tests/models/colpali/test_processing_colpali.py
index 7a51786158..539b604a35 100644
--- a/tests/models/colpali/test_processing_colpali.py
+++ b/tests/models/colpali/test_processing_colpali.py
@@ -54,6 +54,19 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     def tearDownClass(cls):
         shutil.rmtree(cls.tmpdirname, ignore_errors=True)
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     @require_torch
     @require_vision
     def test_process_images(self):
diff --git a/tests/models/colqwen2/test_processing_colqwen2.py b/tests/models/colqwen2/test_processing_colqwen2.py
index 0da0ce86b4..25e6b523c8 100644
--- a/tests/models/colqwen2/test_processing_colqwen2.py
+++ b/tests/models/colqwen2/test_processing_colqwen2.py
@@ -57,6 +57,19 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     def tearDownClass(cls):
         shutil.rmtree(cls.tmpdirname)
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     def test_process_images(self):
         # Processor configuration
         image_input = self.prepare_image_inputs()
diff --git a/tests/models/emu3/test_processor_emu3.py b/tests/models/emu3/test_processor_emu3.py
index c595a91ee9..bb7c8187e5 100644
--- a/tests/models/emu3/test_processor_emu3.py
+++ b/tests/models/emu3/test_processor_emu3.py
@@ -90,3 +90,16 @@ class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
         # For an image where pixels go from 0 to 255 the diff can be 1 due to some numerical precision errors when scaling and unscaling
         self.assertTrue(np.abs(orig_image - unnormalized_images).max() >= 1)
+
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
diff --git a/tests/models/fuyu/test_processor_fuyu.py b/tests/models/fuyu/test_processor_fuyu.py
index 1f2c754bd5..6fb935cbec 100644
--- a/tests/models/fuyu/test_processor_fuyu.py
+++ b/tests/models/fuyu/test_processor_fuyu.py
@@ -64,6 +64,19 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
     def get_image_processor(self, **kwargs):
         return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     def test_fuyu_processing(self):
         """
         Test to ensure that the standard processing on a gold example matches adept's code.
diff --git a/tests/models/gemma3/test_processing_gemma3.py b/tests/models/gemma3/test_processing_gemma3.py
index 30587a8f55..98984a3c08 100644
--- a/tests/models/gemma3/test_processing_gemma3.py
+++ b/tests/models/gemma3/test_processing_gemma3.py
@@ -58,6 +58,19 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         processor.save_pretrained(cls.tmpdirname)
         cls.image_token = processor.boi_token
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     @classmethod
     def tearDownClass(cls):
         shutil.rmtree(cls.tmpdirname, ignore_errors=True)
diff --git a/tests/models/got_ocr2/test_image_processing_got_ocr2.py b/tests/models/got_ocr2/test_image_processing_got_ocr2.py
index 53b44eba61..ccfcf2f062 100644
--- a/tests/models/got_ocr2/test_image_processing_got_ocr2.py
+++ b/tests/models/got_ocr2/test_image_processing_got_ocr2.py
@@ -169,3 +169,24 @@ class GotOcr2ProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
         )
         self.assertEqual(len(processed_images[0]), 5)
         self.assertEqual(processed_images.shape[-2:], (20, 20))
+
+    def test_get_num_patches_without_images(self):
+        """Check `get_number_of_image_patches` for several sizes and `images_kwargs` without passing any image."""
+
+        for processor_cls in self.image_processor_list:
+            image_processor = processor_cls(**self.image_processor_dict)
+
+            # (height, width, images_kwargs, expected patch count); built inside the loop so each
+            # processor class gets its own fresh kwargs dicts.
+            cases = (
+                (100, 100, {}, 1),
+                (300, 500, {"crop_to_patches": False}, 1),
+                (100, 100, {"crop_to_patches": True}, 10),
+                (100, 100, {"crop_to_patches": True, "max_patches": 200}, 50),
+            )
+
+            for height, width, images_kwargs, expected in cases:
+                num_patches = image_processor.get_number_of_image_patches(
+                    height=height, width=width, images_kwargs=images_kwargs
+                )
+                self.assertEqual(num_patches, expected)
diff --git a/tests/models/idefics3/test_image_processing_idefics3.py b/tests/models/idefics3/test_image_processing_idefics3.py
index 7a1eb4f44f..f855de2824 100644
--- a/tests/models/idefics3/test_image_processing_idefics3.py
+++ b/tests/models/idefics3/test_image_processing_idefics3.py
@@ -358,3 +358,28 @@ class Idefics3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
         )
         self.assertEqual(encoding_slow.rows, encoding_fast.rows)
         self.assertEqual(encoding_slow.cols, encoding_fast.cols)
+
+    def test_get_num_patches_without_images(self):
+        """Check `get_number_of_image_patches` for several sizes and `images_kwargs` without passing any image."""
+        for processor_cls in self.image_processor_list:
+            image_processor = processor_cls(**self.image_processor_dict)
+
+            # (height, width, images_kwargs, expected result); built inside the loop so each processor class
+            # gets its own fresh kwargs dicts. The result is presumably (num_patches, rows, cols) — TODO confirm.
+            cases = (
+                (100, 100, {}, (5, 2, 2)),
+                (300, 500, {"do_image_splitting": False}, (1, 1, 1)),
+                (300, 500, {"do_image_splitting": True}, (5, 2, 2)),
+                (
+                    300,
+                    600,
+                    {"do_image_splitting": True, "max_image_size": {"longest_edge": 30}},
+                    (3, 1, 2),
+                ),
+            )
+
+            for height, width, images_kwargs, expected in cases:
+                patches_rows_cols = image_processor.get_number_of_image_patches(
+                    height=height, width=width, images_kwargs=images_kwargs
+                )
+                self.assertEqual(patches_rows_cols, expected)
diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py
index 99b931a12c..7020a24398 100644
--- a/tests/models/idefics3/test_processor_idefics3.py
+++ b/tests/models/idefics3/test_processor_idefics3.py
@@ -84,6 +84,19 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     def prepare_processor_dict():
         return {"image_seq_len": 2}
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     def get_split_image_expected_tokens(self, processor, image_rows, image_cols):
         text_split_images = []
         for n_h in range(image_rows):
diff --git a/tests/models/internvl/test_processor_internvl.py b/tests/models/internvl/test_processor_internvl.py
index 614c5b4866..f3340e8af0 100644
--- a/tests/models/internvl/test_processor_internvl.py
+++ b/tests/models/internvl/test_processor_internvl.py
@@ -97,6 +97,19 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     def tearDownClass(cls):
         shutil.rmtree(cls.tmpdirname, ignore_errors=True)
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     @require_av
     @require_torch
     def test_process_interleaved_images_videos(self):
diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py
index d89601d78b..41b9d8a09e 100644
--- a/tests/models/llava/test_processor_llava.py
+++ b/tests/models/llava/test_processor_llava.py
@@ -61,6 +61,18 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             "vision_feature_select_strategy": "default"
         }  # fmt: skip
 
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     def test_chat_template_is_saved(self):
         processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
         processor_dict_loaded = json.loads(processor_loaded.to_json_string())
diff --git a/tests/models/llava_next/test_processor_llava_next.py b/tests/models/llava_next/test_processor_llava_next.py
index 2adf527d78..d6156adb75 100644
--- a/tests/models/llava_next/test_processor_llava_next.py
+++ b/tests/models/llava_next/test_processor_llava_next.py
@@ -66,6 +66,19 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             "vision_feature_select_strategy": "default"
         }  # fmt: skip
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
     def test_chat_template_is_saved(self):
         processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
diff --git a/tests/models/llava_next_video/test_processor_llava_next_video.py b/tests/models/llava_next_video/test_processor_llava_next_video.py
index b902b8c496..17bcb3657d 100644
--- a/tests/models/llava_next_video/test_processor_llava_next_video.py
+++ b/tests/models/llava_next_video/test_processor_llava_next_video.py
@@ -75,6 +75,19 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             "vision_feature_select_strategy": "default",
         }
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
     def test_chat_template_is_saved(self):
         processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
diff --git a/tests/models/llava_onevision/test_processor_llava_onevision.py b/tests/models/llava_onevision/test_processor_llava_onevision.py
index 52f2b99f92..1eb3b0d0d4 100644
--- a/tests/models/llava_onevision/test_processor_llava_onevision.py
+++ b/tests/models/llava_onevision/test_processor_llava_onevision.py
@@ -79,6 +79,19 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             "vision_feature_select_strategy": "default"
         }  # fmt: skip
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
     def test_chat_template_is_saved(self):
         processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
diff --git a/tests/models/paligemma/test_processor_paligemma.py b/tests/models/paligemma/test_processor_paligemma.py
index 56e7492892..821e18d550 100644
--- a/tests/models/paligemma/test_processor_paligemma.py
+++ b/tests/models/paligemma/test_processor_paligemma.py
@@ -48,6 +48,19 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     def tearDownClass(cls):
         shutil.rmtree(cls.tmpdirname, ignore_errors=True)
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     @require_torch
     @require_vision
     def test_image_seq_length(self):
diff --git a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
index b8aa49b004..c3f478950f 100644
--- a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
@@ -65,6 +65,19 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     def tearDownClass(cls):
         shutil.rmtree(cls.tmpdirname, ignore_errors=True)
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     def test_save_load_pretrained_default(self):
         tokenizer = self.get_tokenizer()
         image_processor = self.get_image_processor()
diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
index 6ff2fa70c0..d17cd690cf 100644
--- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -394,3 +394,17 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
         self._assert_slow_fast_tensors_equivalence(
             encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float()
         )
+
+    def test_get_num_patches_without_images(self):
+        """Check `get_number_of_image_patches` for several sizes and `images_kwargs` without passing any image."""
+        for processor_cls in self.image_processor_list:
+            image_processor = processor_cls(**self.image_processor_dict)
+            # (height, width, images_kwargs, expected patch count); built inside the loop so each
+            # processor class gets its own fresh kwargs dicts.
+            cases = ((100, 100, {}, 64), (200, 50, {}, 56), (100, 100, {"patch_size": 28}, 16))
+
+            for height, width, images_kwargs, expected in cases:
+                num_patches = image_processor.get_number_of_image_patches(
+                    height=height, width=width, images_kwargs=images_kwargs
+                )
+                self.assertEqual(num_patches, expected)
diff --git a/tests/models/qwen2_vl/test_processor_qwen2_vl.py b/tests/models/qwen2_vl/test_processor_qwen2_vl.py
index eb5fdc79d0..69fae59595 100644
--- a/tests/models/qwen2_vl/test_processor_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.py
@@ -68,6 +68,19 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     def tearDownClass(cls):
         shutil.rmtree(cls.tmpdirname, ignore_errors=True)
 
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
+    def test_get_num_vision_tokens(self):
+        """Check the helper used internally in vLLM: it must report one count per requested image size."""
+        processor = self.get_processor()
+        image_sizes = [(100, 100), (300, 100), (500, 30)]
+
+        output = processor._get_num_multimodal_tokens(image_sizes=image_sizes)
+
+        # `assertIn` reports the missing key together with `output` when the check fails.
+        for key in ("num_image_tokens", "num_image_patches"):
+            self.assertIn(key, output)
+            self.assertEqual(len(output[key]), len(image_sizes))
+
     def test_save_load_pretrained_default(self):
         tokenizer = self.get_tokenizer()
         image_processor = self.get_image_processor()
diff --git a/tests/models/smolvlm/test_image_processing_smolvlm.py b/tests/models/smolvlm/test_image_processing_smolvlm.py
index 34b893f817..be687e7952 100644
--- a/tests/models/smolvlm/test_image_processing_smolvlm.py
+++ b/tests/models/smolvlm/test_image_processing_smolvlm.py
@@ -358,3 +358,28 @@ class SmolVLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
         )
         self.assertEqual(encoding_slow.rows, encoding_fast.rows)
         self.assertEqual(encoding_slow.cols, encoding_fast.cols)
+
+    def test_get_num_patches_without_images(self):
+        """Check `get_number_of_image_patches` for several sizes and `images_kwargs` without passing any image."""
+        for processor_cls in self.image_processor_list:
+            image_processor = processor_cls(**self.image_processor_dict)
+
+            # (height, width, images_kwargs, expected result); built inside the loop so each processor class
+            # gets its own fresh kwargs dicts. The result is presumably (num_patches, rows, cols) — TODO confirm.
+            cases = (
+                (100, 100, {}, (5, 2, 2)),
+                (300, 500, {"do_image_splitting": False}, (1, 1, 1)),
+                (300, 500, {"do_image_splitting": True}, (5, 2, 2)),
+                (
+                    300,
+                    600,
+                    {"do_image_splitting": True, "max_image_size": {"longest_edge": 30}},
+                    (3, 1, 2),
+                ),
+            )
+
+            for height, width, images_kwargs, expected in cases:
+                patches_rows_cols = image_processor.get_number_of_image_patches(
+                    height=height, width=width, images_kwargs=images_kwargs
+                )
+                self.assertEqual(patches_rows_cols, expected)