[processor] clean up multimodal tests (#37362)

* clean up multimodal processor tests * fixup * fix tests * fix one last test * forgot
2025-04-11 13:32:19 +02:00
parent 3c39c07939
commit a563999a02
30 changed files with 304 additions and 817 deletions
--- a/src/transformers/models/aria/image_processing_aria.py
+++ b/src/transformers/models/aria/image_processing_aria.py
@@ -31,12 +31,16 @@ from ...image_utils import (
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    make_flat_list_of_images,
    to_numpy_array,
    valid_images,
    validate_preprocess_arguments,
 )
-from ...utils import TensorType
+from ...utils import TensorType, logging
 logger = logging.get_logger(__name__)
 def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
@@ -104,6 +108,12 @@ class AriaImageProcessor(BaseImageProcessor):
            Whether to split the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        resample (PILImageResampling, *optional*, defaults to `BICUBIC`):
@@ -121,6 +131,8 @@ class AriaImageProcessor(BaseImageProcessor):
        split_resolutions: Optional[List[Tuple[int, int]]] = None,
        split_image: Optional[bool] = False,
        do_convert_rgb: Optional[bool] = True,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: Optional[bool] = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
@@ -141,6 +153,8 @@ class AriaImageProcessor(BaseImageProcessor):
            split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions]
        self.split_resolutions = split_resolutions
        self.do_convert_rgb = do_convert_rgb
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.resample = resample
@@ -153,6 +167,8 @@ class AriaImageProcessor(BaseImageProcessor):
        min_image_size: Optional[int] = None,
        split_image: Optional[bool] = None,
        do_convert_rgb: Optional[bool] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        resample: PILImageResampling = None,
        return_tensors: Optional[Union[str, TensorType]] = "pt",
@@ -177,6 +193,10 @@ class AriaImageProcessor(BaseImageProcessor):
                Whether to split the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)):
                Whether to convert the image to RGB.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)):
                Whether to normalize the image.
            resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)):
@@ -217,6 +237,8 @@ class AriaImageProcessor(BaseImageProcessor):
        min_image_size = min_image_size if min_image_size is not None else self.min_image_size
        split_image = split_image if split_image is not None else self.split_image
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample
@@ -236,6 +258,8 @@ class AriaImageProcessor(BaseImageProcessor):
            image_mean=image_mean,
            image_std=image_std,
            resample=resample,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
        )
        if do_convert_rgb:
@@ -244,6 +268,12 @@ class AriaImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]
        if do_rescale and is_scaled_image(images[0]):
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])
@@ -297,9 +327,14 @@ class AriaImageProcessor(BaseImageProcessor):
                pixel_mask[: new_size[0], : new_size[1]] = 1
                pixel_masks.append(pixel_mask)
                if do_rescale:
                    crop_image_padded = self.rescale(
                        image=crop_image_padded, scale=rescale_factor, input_data_format=input_data_format
                    )
                if do_normalize:
                    crop_image_padded = self.normalize(
-                        crop_image_padded / 255.0,
+                        crop_image_padded,
                        self.image_mean,
                        self.image_std,
                        data_format=input_data_format,
--- a/src/transformers/models/aria/modular_aria.py
+++ b/src/transformers/models/aria/modular_aria.py
@@ -28,6 +28,7 @@ from ...image_utils import (
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    make_flat_list_of_images,
    to_numpy_array,
    valid_images,
@@ -495,6 +496,12 @@ class AriaImageProcessor(BaseImageProcessor):
            Whether to split the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        resample (PILImageResampling, *optional*, defaults to `BICUBIC`):
@@ -512,6 +519,8 @@ class AriaImageProcessor(BaseImageProcessor):
        split_resolutions: Optional[List[Tuple[int, int]]] = None,
        split_image: Optional[bool] = False,
        do_convert_rgb: Optional[bool] = True,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: Optional[bool] = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
@@ -532,6 +541,8 @@ class AriaImageProcessor(BaseImageProcessor):
            split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions]
        self.split_resolutions = split_resolutions
        self.do_convert_rgb = do_convert_rgb
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.resample = resample
@@ -544,6 +555,8 @@ class AriaImageProcessor(BaseImageProcessor):
        min_image_size: Optional[int] = None,
        split_image: Optional[bool] = None,
        do_convert_rgb: Optional[bool] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        resample: PILImageResampling = None,
        return_tensors: Optional[Union[str, TensorType]] = "pt",
@@ -568,6 +581,10 @@ class AriaImageProcessor(BaseImageProcessor):
                Whether to split the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)):
                Whether to convert the image to RGB.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)):
                Whether to normalize the image.
            resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)):
@@ -608,6 +625,8 @@ class AriaImageProcessor(BaseImageProcessor):
        min_image_size = min_image_size if min_image_size is not None else self.min_image_size
        split_image = split_image if split_image is not None else self.split_image
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample
@@ -627,6 +646,8 @@ class AriaImageProcessor(BaseImageProcessor):
            image_mean=image_mean,
            image_std=image_std,
            resample=resample,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
        )
        if do_convert_rgb:
@@ -635,6 +656,12 @@ class AriaImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]
        if do_rescale and is_scaled_image(images[0]):
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])
@@ -688,9 +715,14 @@ class AriaImageProcessor(BaseImageProcessor):
                pixel_mask[: new_size[0], : new_size[1]] = 1
                pixel_masks.append(pixel_mask)
                if do_rescale:
                    crop_image_padded = self.rescale(
                        image=crop_image_padded, scale=rescale_factor, input_data_format=input_data_format
                    )
                if do_normalize:
                    crop_image_padded = self.normalize(
-                        crop_image_padded / 255.0,
+                        crop_image_padded,
                        self.image_mean,
                        self.image_std,
                        data_format=input_data_format,
--- a/src/transformers/models/colpali/processing_colpali.py
+++ b/src/transformers/models/colpali/processing_colpali.py
@@ -118,8 +118,10 @@ class ColPaliProcessor(ProcessorMixin):
            tokens_to_add = {"additional_special_tokens": [image_token]}
            tokenizer.add_special_tokens(tokens_to_add)
            self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
            self.image_token = IMAGE_TOKEN
        else:
            self.image_token_id = tokenizer.image_token_id
            self.image_token = tokenizer.image_token
        tokenizer.add_tokens(EXTRA_TOKENS)
        tokenizer.add_bos_token = False
--- a/src/transformers/models/idefics/image_processing_idefics.py
+++ b/src/transformers/models/idefics/image_processing_idefics.py
@@ -65,6 +65,12 @@ class IdeficsImageProcessor(BaseImageProcessor):
            Can be overridden by the `image_std` parameter in the `preprocess` method.
        image_num_channels (`int`, *optional*, defaults to 3):
            Number of image channels.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
    """
    model_input_names = ["pixel_values"]
@@ -75,14 +81,18 @@ class IdeficsImageProcessor(BaseImageProcessor):
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        image_num_channels: Optional[int] = 3,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.image_size = image_size
        self.image_num_channels = image_num_channels
-        self.image_mean = image_mean
+        self.image_mean = image_mean if image_mean is not None else IDEFICS_STANDARD_MEAN
-        self.image_std = image_std
+        self.image_std = image_std if image_std is not None else IDEFICS_STANDARD_STD
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
    def preprocess(
        self,
@@ -92,6 +102,8 @@ class IdeficsImageProcessor(BaseImageProcessor):
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        transform: Callable = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
        **kwargs,
    ) -> TensorType:
@@ -117,6 +129,12 @@ class IdeficsImageProcessor(BaseImageProcessor):
                A custom transform function that accepts a single image can be passed for training. For example,
                `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
                assumed - and then a preset of inference-specific transforms will be applied to the images
            do_rescale (`bool`, *optional*, defaults to `True`):
                Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
                the `preprocess` method.
            rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
                Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
                method.
        Returns:
            a PyTorch tensor of the processed images
@@ -126,6 +144,8 @@ class IdeficsImageProcessor(BaseImageProcessor):
        image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        size = (image_size, image_size)
        if isinstance(images, list) and len(images) == 0:
@@ -160,7 +180,7 @@ class IdeficsImageProcessor(BaseImageProcessor):
        # further transforms expect numpy arrays
        images = [to_numpy_array(x) for x in images]
        images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
-        images = [self.rescale(image=image, scale=1 / 255) for image in images]
+        images = [self.rescale(image=image, scale=rescale_factor) for image in images]
        images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
        images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
        images = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)["pixel_values"]
--- a/src/transformers/models/paligemma/processing_paligemma.py
+++ b/src/transformers/models/paligemma/processing_paligemma.py
@@ -141,8 +141,10 @@ class PaliGemmaProcessor(ProcessorMixin):
            tokens_to_add = {"additional_special_tokens": [image_token]}
            tokenizer.add_special_tokens(tokens_to_add)
            self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
            self.image_token = IMAGE_TOKEN
        else:
            self.image_token_id = tokenizer.image_token_id
            self.image_token = tokenizer.image_token
        tokenizer.add_tokens(EXTRA_TOKENS)
        tokenizer.add_bos_token = False
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -1086,7 +1086,6 @@ class ProcessorMixin(PushToHubMixin):
        args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
        processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
        processor_dict.update({k: v for k, v in kwargs.items() if k in processor_dict.keys()})
        return cls.from_args_and_dict(args, processor_dict, **kwargs)
    @classmethod
--- a/tests/models/aria/test_processor_aria.py
+++ b/tests/models/aria/test_processor_aria.py
@@ -16,7 +16,6 @@ import shutil
 import tempfile
 import unittest
 from io import BytesIO
 from typing import Optional
 import numpy as np
 import requests
@@ -41,7 +40,7 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.tmpdirname = tempfile.mkdtemp()
-        processor = AriaProcessor.from_pretrained("m-ric/Aria_hf_2", image_seq_len=2)
+        processor = AriaProcessor.from_pretrained("m-ric/Aria_hf_2", size_conversion={490: 2, 980: 2})
        processor.save_pretrained(cls.tmpdirname)
        cls.image1 = Image.open(
            BytesIO(
@@ -74,7 +73,14 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token)
        cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"]
        cls.padding_token_id = processor.tokenizer.pad_token_id
-        cls.image_seq_len = 256
+        cls.image_seq_len = 2
    @staticmethod
    def prepare_processor_dict():
        return {
            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<fim_prefix><|img|><fim_suffix>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
            "size_conversion": {490: 2, 980: 2},
        }  # fmt: skip
    def get_tokenizer(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -89,24 +95,6 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdirname, ignore_errors=True)
    def test_kwargs_overrides_default_image_processor_kwargs(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        processor_components = self.prepare_components()
        processor_components["image_processor"] = self.get_component(
            "image_processor", do_rescale=True, rescale_factor=1
        )
        processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
        processor = self.processor_class(**processor_components)
        self.skip_processor_without_typed_kwargs(processor)
        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
        self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
    def test_process_interleaved_images_prompts_image_splitting(self):
        processor = self.get_processor()
        processor.image_processor.split_image = True
@@ -236,155 +224,50 @@ And who is that?<|im_end|>
 """
        self.assertEqual(rendered, expected_rendered)
-    # Override as AriaProcessor needs image tokens in prompts
+    def test_image_chat_template_accepts_processing_kwargs(self):
-    def prepare_text_inputs(self, batch_size: Optional[int] = None):
+        processor = self.get_processor()
-        if batch_size is None:
+        if processor.chat_template is None:
-            return "lower newer <|img|>"
+            self.skipTest("Processor has no chat template")
-        if batch_size < 1:
+        messages = [
-            raise ValueError("batch_size must be greater than 0")
+            [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What is shown in this image?"},
                    ],
                },
            ]
        ]
-        if batch_size == 1:
+        formatted_prompt_tokenized = processor.apply_chat_template(
-            return ["lower newer <|img|>"]
+            messages,
-        return ["lower newer <|img|>", "<|img|> upper older longer string"] + ["<|img|> lower newer"] * (
+            add_generation_prompt=True,
-            batch_size - 2
+            tokenize=True,
        )
    # Override tests as inputs_ids padded dimension is the second one but not the last one
    @require_vision
    @require_torch
    def test_kwargs_overrides_default_tokenizer_kwargs(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer", max_length=30)
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30)
        self.assertEqual(len(inputs["input_ids"][0]), 30)
    @require_torch
    @require_vision
    def test_structured_kwargs_nested(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        # Define the kwargs for each modality
        inputs = processor(
            text=input_str,
            images=image_input,
            common_kwargs={"return_tensors": "pt"},
            images_kwargs={"max_image_size": 980},
            text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
        )
        self.skip_processor_without_typed_kwargs(processor)
        self.assertEqual(inputs["pixel_values"].shape[3], 980)
        self.assertEqual(len(inputs["input_ids"][0]), 120)
    @require_torch
    @require_vision
    def test_structured_kwargs_nested_from_dict(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        # Define the kwargs for each modality
        all_kwargs = {
            "common_kwargs": {"return_tensors": "pt"},
            "images_kwargs": {"max_image_size": 980},
            "text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
        }
        inputs = processor(text=input_str, images=image_input, **all_kwargs)
        self.assertEqual(inputs["pixel_values"].shape[3], 980)
        self.assertEqual(len(inputs["input_ids"][0]), 120)
    @require_vision
    @require_torch
    def test_tokenizer_defaults_preserved_by_kwargs(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer", max_length=30)
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
        self.assertEqual(len(inputs["input_ids"][0]), 30)
    @require_torch
    @require_vision
    def test_unstructured_kwargs_batched(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
        input_str = self.prepare_text_inputs(batch_size=2)
        image_input = self.prepare_image_inputs(batch_size=2)
        inputs = processor(
            text=input_str,
            images=image_input,
            return_tensors="pt",
            padding="longest",
            max_length=76,
            truncation=True,
            max_image_size=980,
        )
        self.assertEqual(inputs["pixel_values"].shape[1], 3)
        self.assertEqual(inputs["pixel_values"].shape[3], 980)
        self.assertEqual(len(inputs["input_ids"][0]), 76)
    @require_torch
    @require_vision
    def test_unstructured_kwargs(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        inputs = processor(
            text=input_str,
            images=image_input,
            return_tensors="pt",
            max_image_size=980,
            padding="max_length",
-            max_length=120,
+            max_length=50,
            truncation="longest_first",
        )
        self.assertEqual(len(formatted_prompt_tokenized[0]), 50)
-        self.assertEqual(inputs["pixel_values"].shape[3], 980)
+        formatted_prompt_tokenized = processor.apply_chat_template(
-        self.assertEqual(len(inputs["input_ids"][0]), 120)
+            messages,
            add_generation_prompt=True,
            tokenize=True,
            truncation=True,
            max_length=5,
        )
        self.assertEqual(len(formatted_prompt_tokenized[0]), 5)
        # Now test the ability to return dict
        messages[0][0]["content"].append(
            {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
        )
        out_dict = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            max_image_size=980,
            return_tensors="np",
        )
        self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980])
--- a/tests/models/aya_vision/test_processor_aya_vision.py
+++ b/tests/models/aya_vision/test_processor_aya_vision.py
@@ -15,7 +15,6 @@
 import shutil
 import tempfile
 import unittest
 from typing import Optional
 from transformers import AutoProcessor, AutoTokenizer, AyaVisionProcessor
 from transformers.testing_utils import require_read_token, require_torch, require_vision
@@ -61,6 +60,7 @@ class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            **processor_kwargs,
        )
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
    @staticmethod
    def prepare_processor_dict():
@@ -79,20 +79,6 @@ class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdirname, ignore_errors=True)
    # Override as AyaVisionProcessor needs image tokens in prompts
    def prepare_text_inputs(self, batch_size: Optional[int] = None):
        if batch_size is None:
            return "lower newer <image>"
        if batch_size < 1:
            raise ValueError("batch_size must be greater than 0")
        if batch_size == 1:
            return ["lower newer <image>"]
        return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * (
            batch_size - 2
        )
    @require_torch
    def test_process_interleaved_images_videos(self):
        processor = self.get_processor()
--- a/tests/models/chameleon/test_processor_chameleon.py
+++ b/tests/models/chameleon/test_processor_chameleon.py
@@ -40,5 +40,10 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB)
        tokenizer.pad_token_id = 0
        tokenizer.sep_token_id = 1
-        processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer)
+        processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer, image_seq_length=2)
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
    @staticmethod
    def prepare_processor_dict():
        return {"image_seq_length": 2}  # fmt: skip
--- a/tests/models/emu3/test_processor_emu3.py
+++ b/tests/models/emu3/test_processor_emu3.py
@@ -34,7 +34,7 @@ class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.tmpdirname = tempfile.mkdtemp()
-        image_processor = Emu3ImageProcessor()
+        image_processor = Emu3ImageProcessor(min_pixels=28 * 28, max_pixels=56 * 56)
        extra_special_tokens = extra_special_tokens = {
            "image_token": "<image>",
            "boi_token": "<|image start|>",
@@ -51,8 +51,10 @@ class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            image_processor=image_processor, tokenizer=tokenizer, chat_template="dummy_template"
        )
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
-    def prepare_processor_dict(self):
+    @staticmethod
    def prepare_processor_dict():
        """Return a dict holding a Jinja chat template string under the `chat_template` key."""
        return {
            "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
        }  # fmt: skip
--- a/tests/models/fuyu/test_processor_fuyu.py
+++ b/tests/models/fuyu/test_processor_fuyu.py
@@ -332,7 +332,7 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
            max_length=76,
        )
-        self.assertEqual(len(inputs["input_ids"][0]), 6)
+        self.assertEqual(len(inputs["input_ids"][0]), 7)
@require_torch
--- a/tests/models/gemma3/test_processing_gemma3.py
+++ b/tests/models/gemma3/test_processing_gemma3.py
@@ -56,6 +56,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor_kwargs = cls.prepare_processor_dict()
        processor = Gemma3Processor(image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs)
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.boi_token
    @classmethod
    def tearDownClass(cls):
@@ -68,20 +69,6 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n    {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n    {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n    {%- set first_user_prefix = \"\" -%}\n    {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n        {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n    {%- endif -%}\n    {%- if (message['role'] == 'assistant') -%}\n        {%- set role = \"model\" -%}\n    {%- else -%}\n        {%- set role = message['role'] -%}\n    {%- endif -%}\n    {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n    {%- if message['content'] is string -%}\n        {{ message['content'] | trim }}\n    {%- elif message['content'] is iterable -%}\n        {%- for item in message['content'] -%}\n            {%- if item['type'] == 'image' -%}\n                {{ '<start_of_image>' }}\n            {%- elif item['type'] == 'text' -%}\n                {{ item['text'] | trim }}\n            {%- endif -%}\n        {%- endfor -%}\n    {%- else -%}\n        {{ raise_exception(\"Invalid content type\") }}\n    {%- endif -%}\n    {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n    {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",            "image_seq_length": 3,
        }  # fmt: skip
    # Override as VLMs need image tokens in prompts
    def prepare_text_inputs(self, batch_size: Optional[int] = None):
        """Build text prompts that each contain a `<start_of_image>` placeholder.

        Returns a single string when `batch_size` is None, otherwise a list
        with `batch_size` prompts.

        Raises:
            ValueError: if `batch_size` is smaller than 1.
        """
        leading = "lower newer <start_of_image>"
        if batch_size is None:
            return leading
        if batch_size < 1:
            raise ValueError("batch_size must be greater than 0")
        if batch_size == 1:
            return [leading]

        batch = [leading, "<start_of_image> upper older longer string"]
        filler_count = batch_size - 2
        batch += ["<start_of_image> lower newer"] * filler_count
        return batch
    # Override as Gemma3 needs images to be an explicitly nested batch
    def prepare_image_inputs(self, batch_size: Optional[int] = None):
        """This function prepares a list of PIL images for testing"""
@@ -123,7 +110,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor_kwargs = self.prepare_processor_dict()
        processor = self.processor_class(**processor_components, **processor_kwargs)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="image")
        image_input = self.prepare_image_inputs()
        inputs = processor(
            text=input_str,
--- a/tests/models/got_ocr2/test_processor_got_ocr2.py
+++ b/tests/models/got_ocr2/test_processor_got_ocr2.py
@@ -40,6 +40,7 @@ class GotOcr2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor_kwargs = {}
        processor = GotOcr2Processor(image_processor, tokenizer, **processor_kwargs)
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.img_pad_token
    def get_tokenizer(self, **kwargs):
        """Load a processor from `self.tmpdirname` (forwarding `kwargs`) and return its tokenizer."""
        loaded_processor = AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
        return loaded_processor.tokenizer
--- a/tests/models/grounding_dino/test_processor_grounding_dino.py
+++ b/tests/models/grounding_dino/test_processor_grounding_dino.py
@@ -79,7 +79,7 @@ class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        cls.embed_dim = 5
        cls.seq_length = 5
-    def prepare_text_inputs(self, batch_size: Optional[int] = None):
+    def prepare_text_inputs(self, batch_size: Optional[int] = None, modality: Optional[str] = None):
        labels = ["a cat", "remote control"]
        labels_longer = ["a person", "a car", "a dog", "a cat"]
--- a/tests/models/idefics/test_processor_idefics.py
+++ b/tests/models/idefics/test_processor_idefics.py
@@ -219,139 +219,3 @@ class IdeficsProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask']
        self.assertSetEqual(set(inputs.keys()), set(self.input_keys))
    # Override the following tests as Idefics image processor does not accept do_rescale and rescale_factor
    @require_torch
    @require_vision
    def test_image_processor_defaults_preserved_by_image_kwargs(self):
        """Check that the `image_size=234` given to the image processor is applied when the call passes no override."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")

        vision_component = self.get_component("image_processor", image_size=234)
        text_component = self.get_component("tokenizer", max_length=117)
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)

        prompt = self.prepare_text_inputs()
        images = self.prepare_image_inputs()
        outputs = processor(text=prompt, images=images)

        innermost = outputs["pixel_values"][0][0][0]
        self.assertEqual(len(innermost), 234)
    @require_torch
    @require_vision
    def test_kwargs_overrides_default_image_processor_kwargs(self):
        """Check that `image_size=224` passed at call time wins over the `image_size=234` set on the image processor."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")

        vision_component = self.get_component("image_processor", image_size=234)
        text_component = self.get_component("tokenizer", max_length=117)
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)

        prompt = self.prepare_text_inputs()
        images = self.prepare_image_inputs()
        outputs = processor(text=prompt, images=images, image_size=224)

        innermost = outputs["pixel_values"][0][0][0]
        self.assertEqual(len(innermost), 224)
    @require_torch
    @require_vision
    def test_unstructured_kwargs(self):
        """Pass image and text kwargs flat to the processor and check both took effect."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")

        vision_component = self.get_component("image_processor")
        text_component = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)

        prompt = self.prepare_text_inputs()
        images = self.prepare_image_inputs()
        # Flat (unstructured) kwargs: not grouped per modality.
        flat_kwargs = {
            "return_tensors": "pt",
            "image_size": 214,
            "padding": "max_length",
            "max_length": 76,
        }
        outputs = processor(text=prompt, images=images, **flat_kwargs)

        self.assertEqual(outputs["pixel_values"].shape[3], 214)
        self.assertEqual(len(outputs["input_ids"][0]), 76)
    @require_torch
    @require_vision
    def test_unstructured_kwargs_batched(self):
        """Pass flat kwargs with a batch of two prompts and images and check the output sizes."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")

        vision_component = self.get_component("image_processor")
        text_component = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)

        prompts = self.prepare_text_inputs(batch_size=2)
        images = self.prepare_image_inputs(batch_size=2)
        flat_kwargs = {
            "return_tensors": "pt",
            "image_size": 214,
            "padding": "longest",
            "max_length": 76,
        }
        outputs = processor(text=prompts, images=images, **flat_kwargs)

        self.assertEqual(outputs["pixel_values"].shape[3], 214)
        self.assertEqual(len(outputs["input_ids"][0]), 8)
    @require_torch
    @require_vision
    def test_structured_kwargs_nested(self):
        """Pass kwargs grouped per modality and check that each group reached its component.

        The typed-kwargs skip check runs once, before the processor is called;
        the original repeated it after the call, where it could no longer guard anything.
        """
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        # Define the kwargs for each modality
        all_kwargs = {
            "common_kwargs": {"return_tensors": "pt"},
            "images_kwargs": {"image_size": 214},
            "text_kwargs": {"padding": "max_length", "max_length": 76},
        }
        inputs = processor(text=input_str, images=image_input, **all_kwargs)
        self.assertEqual(inputs["pixel_values"].shape[3], 214)
        self.assertEqual(len(inputs["input_ids"][0]), 76)
    @require_torch
    @require_vision
    def test_structured_kwargs_nested_from_dict(self):
        """Unpack a dict of per-modality kwargs into the processor call and check the output sizes."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")

        vision_component = self.get_component("image_processor")
        text_component = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)

        prompt = self.prepare_text_inputs()
        images = self.prepare_image_inputs()

        # One entry per kwargs group understood by the processor
        nested_kwargs = {
            "common_kwargs": {"return_tensors": "pt"},
            "images_kwargs": {"image_size": 214},
            "text_kwargs": {"padding": "max_length", "max_length": 76},
        }
        outputs = processor(text=prompt, images=images, **nested_kwargs)

        self.assertEqual(outputs["pixel_values"].shape[3], 214)
        self.assertEqual(len(outputs["input_ids"][0]), 76)
--- a/tests/models/idefics2/test_processor_idefics2.py
+++ b/tests/models/idefics2/test_processor_idefics2.py
@@ -16,7 +16,6 @@ import shutil
 import tempfile
 import unittest
 from io import BytesIO
 from typing import Optional
 import requests
@@ -84,6 +83,10 @@ class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def get_processor(self, **kwargs):
        """Load and return a processor from `self.tmpdirname`, forwarding `kwargs` to the loader."""
        loaded_processor = AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
        return loaded_processor
    @staticmethod
    def prepare_processor_dict():
        return {"image_seq_len": 2}
    @classmethod
    def tearDownClass(cls):
        """Delete the directory at `cls.tmpdirname`, ignoring any removal errors."""
        scratch_dir = cls.tmpdirname
        shutil.rmtree(scratch_dir, ignore_errors=True)
@@ -329,17 +332,3 @@ class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            "Assistant:"
        )
        self.assertEqual(rendered, expected_rendered)
    # Override as Idefics2Processor needs image tokens in prompts
    def prepare_text_inputs(self, batch_size: Optional[int] = None):
        """Build text prompts that each contain an `<image>` placeholder.

        Returns a single string when `batch_size` is None, otherwise a list
        with `batch_size` prompts.

        Raises:
            ValueError: if `batch_size` is smaller than 1.
        """
        single = "lower newer <image>"
        if batch_size is None:
            return single
        if batch_size < 1:
            raise ValueError("batch_size must be greater than 0")
        if batch_size == 1:
            return [single]

        remaining = batch_size - 2
        filler = ["<image> lower newer"] * remaining
        return [single, "<image> upper older longer string", *filler]
--- a/tests/models/idefics3/test_processor_idefics3.py
+++ b/tests/models/idefics3/test_processor_idefics3.py
@@ -16,7 +16,6 @@ import shutil
 import tempfile
 import unittest
 from io import BytesIO
 from typing import Optional
 import numpy as np
 import requests
@@ -81,6 +80,10 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def get_processor(self, **kwargs):
        """Load and return a processor from `self.tmpdirname`, forwarding `kwargs` to the loader."""
        loaded_processor = AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
        return loaded_processor
    @staticmethod
    def prepare_processor_dict():
        return {"image_seq_len": 2}
    def get_split_image_expected_tokens(self, processor, image_rows, image_cols):
        text_split_images = []
        for n_h in range(image_rows):
@@ -352,159 +355,6 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        )
        self.assertEqual(rendered, expected_rendered)
    # Override as Idefics3Processor needs image tokens in prompts
    def prepare_text_inputs(self, batch_size: Optional[int] = None):
        """Build text prompts that each contain an `<image>` placeholder.

        Returns a single string when `batch_size` is None, otherwise a list
        with `batch_size` prompts.

        Raises:
            ValueError: if `batch_size` is smaller than 1.
        """
        opener = "lower newer <image>"
        if batch_size is None:
            return opener
        if batch_size < 1:
            raise ValueError("batch_size must be greater than 0")
        if batch_size == 1:
            return [opener]

        texts = [opener, "<image> upper older longer string"]
        for _ in range(batch_size - 2):
            texts.append("<image> lower newer")
        return texts
    # Override tests as inputs_ids padded dimension is the second one but not the last one
    @require_vision
    @require_torch
    def test_kwargs_overrides_default_tokenizer_kwargs(self):
        """Call the processor with `max_length=30` and check the first `input_ids` row has 30 entries."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")

        vision_component = self.get_component("image_processor")
        text_component = self.get_component("tokenizer", max_length=30)
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)

        prompt = self.prepare_text_inputs()
        images = self.prepare_image_inputs()
        outputs = processor(text=prompt, images=images, return_tensors="pt", max_length=30)

        first_sequence = outputs["input_ids"][0]
        self.assertEqual(len(first_sequence), 30)
    @require_torch
    @require_vision
    def test_structured_kwargs_nested(self):
        """Pass kwargs grouped per modality and check that each group reached its component.

        The typed-kwargs skip check runs once, before the processor is called;
        the original repeated it after the call, where it could no longer guard anything.
        """
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        # Define the kwargs for each modality
        inputs = processor(
            text=input_str,
            images=image_input,
            common_kwargs={"return_tensors": "pt"},
            images_kwargs={"max_image_size": {"longest_edge": 32}},
            text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
        )
        self.assertEqual(inputs["pixel_values"].shape[3], 32)
        self.assertEqual(len(inputs["input_ids"][0]), 120)
    @require_torch
    @require_vision
    def test_structured_kwargs_nested_from_dict(self):
        """Unpack a dict of per-modality kwargs into the processor call and check the output sizes."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")

        vision_component = self.get_component("image_processor")
        text_component = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)

        prompt = self.prepare_text_inputs()
        images = self.prepare_image_inputs()

        # One entry per kwargs group understood by the processor
        nested_kwargs = {
            "common_kwargs": {"return_tensors": "pt"},
            "images_kwargs": {"max_image_size": {"longest_edge": 32}},
            "text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
        }
        outputs = processor(text=prompt, images=images, **nested_kwargs)

        self.assertEqual(outputs["pixel_values"].shape[3], 32)
        self.assertEqual(len(outputs["input_ids"][0]), 120)
    @require_vision
    @require_torch
    def test_tokenizer_defaults_preserved_by_kwargs(self):
        """Build the tokenizer with `max_length=30`, call without a length override, and expect 30 ids in the first row."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")

        vision_component = self.get_component("image_processor")
        text_component = self.get_component("tokenizer", max_length=30)
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)

        prompt = self.prepare_text_inputs()
        images = self.prepare_image_inputs()
        outputs = processor(text=prompt, images=images, return_tensors="pt")

        first_sequence = outputs["input_ids"][0]
        self.assertEqual(len(first_sequence), 30)
    @require_torch
    @require_vision
    def test_unstructured_kwargs_batched(self):
        """Pass flat kwargs with a batch of two prompts and images and check the output sizes."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")

        vision_component = self.get_component("image_processor")
        text_component = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)

        prompts = self.prepare_text_inputs(batch_size=2)
        images = self.prepare_image_inputs(batch_size=2)
        flat_kwargs = {
            "return_tensors": "pt",
            "padding": "longest",
            "max_length": 76,
            "truncation": True,
            "max_image_size": {"longest_edge": 30},
        }
        outputs = processor(text=prompts, images=images, **flat_kwargs)

        pixel_shape = outputs["pixel_values"].shape
        self.assertEqual(pixel_shape[2], 3)
        self.assertEqual(pixel_shape[3], 30)
        self.assertEqual(len(outputs["input_ids"][0]), 76)
    @require_torch
    @require_vision
    def test_unstructured_kwargs(self):
        """Pass image and text kwargs flat to the processor and check both took effect."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")

        vision_component = self.get_component("image_processor")
        text_component = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)

        prompt = self.prepare_text_inputs()
        images = self.prepare_image_inputs()
        # Flat (unstructured) kwargs: not grouped per modality.
        flat_kwargs = {
            "return_tensors": "pt",
            "max_image_size": {"longest_edge": 32},
            "padding": "max_length",
            "max_length": 120,
            "truncation": "longest_first",
        }
        outputs = processor(text=prompt, images=images, **flat_kwargs)

        self.assertEqual(outputs["pixel_values"].shape[3], 32)
        self.assertEqual(len(outputs["input_ids"][0]), 120)
    @require_torch
    @require_vision
    def test_text_only_inference(self):
--- a/tests/models/llama4/test_processor_llama4.py
+++ b/tests/models/llama4/test_processor_llama4.py
@@ -15,7 +15,6 @@
 import shutil
 import tempfile
 import unittest
 from typing import Optional
 from transformers import AutoProcessor, Llama4Processor, PreTrainedTokenizerFast
 from transformers.testing_utils import require_vision
@@ -38,9 +37,10 @@ class Llama4ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_processor = Llama4ImageProcessorFast(max_patches=1, size={"height": 20, "width": 20})
        tokenizer = PreTrainedTokenizerFast.from_pretrained("unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit")
-        processor_kwargs = {}
+        processor_kwargs = cls.prepare_processor_dict()
        processor = Llama4Processor(image_processor, tokenizer, **processor_kwargs)
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
    def get_tokenizer(self, **kwargs):
        """Load a processor from `self.tmpdirname` (forwarding `kwargs`) and return its tokenizer."""
        loaded_processor = AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
        return loaded_processor.tokenizer
@@ -51,21 +51,3 @@ class Llama4ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    @classmethod
    def tearDownClass(cls):
        """Delete the directory at `cls.tmpdirname`."""
        scratch_dir = cls.tmpdirname
        shutil.rmtree(scratch_dir)
    # Override as Llama4Processor needs image tokens in prompts
    def prepare_text_inputs(self, batch_size: Optional[int] = None):
        """Build text prompts that each contain an `<|image|>` placeholder.

        Returns a single string when `batch_size` is None, otherwise a list
        with `batch_size` prompts.

        Raises:
            ValueError: if `batch_size` is smaller than 1.
        """
        base_prompt = "lower newer <|image|>"
        if batch_size is None:
            return base_prompt
        if batch_size < 1:
            raise ValueError("batch_size must be greater than 0")
        if batch_size == 1:
            return [base_prompt]

        prompts = [base_prompt, "<|image|> upper older longer string"]
        prompts.extend(["<|image|> lower newer"] * (batch_size - 2))
        return prompts
    @unittest.skip("This test uses return_tensors='np' which is not supported")
    def test_image_chat_template_accepts_processing_kwargs(self):
        pass
--- a/tests/models/llava/test_processor_llava.py
+++ b/tests/models/llava/test_processor_llava.py
@@ -43,6 +43,7 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor_kwargs = cls.prepare_processor_dict()
        processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs)
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
    def get_tokenizer(self, **kwargs):
        """Load a processor from `self.tmpdirname` (forwarding `kwargs`) and return its tokenizer."""
        loaded_processor = AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
        return loaded_processor.tokenizer
@@ -58,18 +59,10 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def prepare_processor_dict():
        return {
            "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
-            "patch_size": 3,
+            "patch_size": 128,
            "vision_feature_select_strategy": "default"
        }  # fmt: skip
    @unittest.skip(
        "Skip because the model has no processor kwargs except for chat template and"
        "chat template is saved as a separate file. Stop skipping this test when the processor"
        "has new kwargs saved in config file."
    )
    def test_processor_to_json_string(self):
        pass
    def test_chat_template_is_saved(self):
        processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
        processor_dict_loaded = json.loads(processor_loaded.to_json_string())
--- a/tests/models/llava_next/test_processor_llava_next.py
+++ b/tests/models/llava_next/test_processor_llava_next.py
@@ -43,6 +43,7 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor_kwargs = cls.prepare_processor_dict()
        processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs)
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
    def get_tokenizer(self, **kwargs):
        """Load a `LlavaNextProcessor` from `self.tmpdirname` (forwarding `kwargs`) and return its tokenizer."""
        loaded_processor = LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs)
        return loaded_processor.tokenizer
@@ -54,18 +55,10 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def prepare_processor_dict():
        return {
            "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
-            "patch_size": 3,
+            "patch_size": 128,
            "vision_feature_select_strategy": "default"
        }  # fmt: skip
    @unittest.skip(
        "Skip because the model has no processor kwargs except for chat template and"
        "chat template is saved as a separate file. Stop skipping this test when the processor"
        "has new kwargs saved in config file."
    )
    def test_processor_to_json_string(self):
        pass
    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
    def test_chat_template_is_saved(self):
        processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
--- a/tests/models/llava_next_video/test_processor_llava_next_video.py
+++ b/tests/models/llava_next_video/test_processor_llava_next_video.py
@@ -47,6 +47,8 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            video_processor=video_processor, image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs
        )
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
        cls.video_token = processor.video_token
    def get_tokenizer(self, **kwargs):
        """Load a processor from `self.tmpdirname` (forwarding `kwargs`) and return its tokenizer."""
        loaded_processor = AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
        return loaded_processor.tokenizer
@@ -61,20 +63,11 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def prepare_processor_dict(cls):
        return {
            "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + ' '}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '<video>' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ '\n' + content['text'] }}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ '\n' + content['text'] }}{% endgeneration %}{% endfor %}{% endif %}{{'<|im_end|>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
-            "num_additional_image_tokens": 6,
+            "num_additional_image_tokens": 0,
-            "patch_size": 4,
+            "patch_size": 128,
            "vision_feature_select_strategy": "default",
        }
    def test_processor_to_json_string(self):
        """Check every non-template entry of `prepare_processor_dict()` against the JSON dump and the processor attribute."""
        processor = self.get_processor()
        serialized = json.loads(processor.to_json_string())
        for name, expected in self.prepare_processor_dict().items():
            # Chat templates are saved in separate files and are covered by a separate test
            if name == "chat_template":
                continue
            self.assertEqual(serialized[name], expected)
            self.assertEqual(getattr(processor, name, None), expected)
    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
    def test_chat_template_is_saved(self):
        processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
--- a/tests/models/llava_onevision/test_processor_llava_onevision.py
+++ b/tests/models/llava_onevision/test_processor_llava_onevision.py
@@ -51,6 +51,8 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            video_processor=video_processor, image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs
        )
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
        cls.video_token = processor.video_token
    def get_tokenizer(self, **kwargs):
        """Load a processor from `self.tmpdirname` (forwarding `kwargs`) and return its tokenizer."""
        loaded_processor = AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
        return loaded_processor.tokenizer
@@ -73,15 +75,6 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            "vision_feature_select_strategy": "default"
        }  # fmt: skip
    def test_processor_to_json_string(self):
        """Check every non-template entry of `prepare_processor_dict()` against the JSON dump and the processor attribute."""
        processor = self.get_processor()
        serialized = json.loads(processor.to_json_string())
        for name, expected in self.prepare_processor_dict().items():
            # Chat templates are saved in separate files and are covered by a separate test
            if name == "chat_template":
                continue
            self.assertEqual(serialized[name], expected)
            self.assertEqual(getattr(processor, name, None), expected)
    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
    def test_chat_template_is_saved(self):
        processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
--- a/tests/models/mistral3/test_processor_mistral3.py
+++ b/tests/models/mistral3/test_processor_mistral3.py
@@ -19,7 +19,7 @@ import unittest
 import requests
 from transformers import PixtralProcessor
-from transformers.testing_utils import require_read_token, require_vision
+from transformers.testing_utils import require_vision
 from transformers.utils import is_torch_available, is_vision_available
 from ...test_processing_common import ProcessorTesterMixin
@@ -34,7 +34,6 @@ if is_vision_available():
@require_vision
@require_read_token
 class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """This tests Pixtral processor with the new `spatial_merge_size` argument in Mistral3."""
@@ -49,30 +48,37 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        cls.url_2 = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw)
-    def setUp(self):
+        cls.tmpdirname = tempfile.mkdtemp()
-        self.tmpdirname = tempfile.mkdtemp()
+        cls.addClassCleanup(lambda tempdir=cls.tmpdirname: shutil.rmtree(tempdir))
        processor_kwargs = cls.prepare_processor_dict()
        processor = PixtralProcessor.from_pretrained(
-            "hf-internal-testing/Mistral-Small-3.1-24B-Instruct-2503-only-processor"
+            "hf-internal-testing/Mistral-Small-3.1-24B-Instruct-2503-only-processor", **processor_kwargs
        )
-        processor.save_pretrained(self.tmpdirname)
+        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
    def get_processor(self):
        return self.processor_class.from_pretrained(self.tmpdirname)
-    def tearDown(self):
+    @staticmethod
-        shutil.rmtree(self.tmpdirname)
+    def prepare_processor_dict():
        return {
            "chat_template": "{%- set today = strftime_now(\"%Y-%m-%d\") %}\n{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\\nYour knowledge base was last updated on 2023-10-01. The current date is \" + today + \".\\n\\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \\\"What are some good restaurants around me?\\\" => \\\"Where are you?\\\" or \\\"When is the next flight to Tokyo\\\" => \\\"Where do you travel from?\\\")\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n    {%- if messages[0] is string %}\n        {%- set system_message = messages[0]['content'] %}\n        {%- set loop_messages = messages[1:] %}\n    {%- else %} \n        {%- set system_message = messages[0]['content'][0]['text'] %}\n        {%- set loop_messages = messages[1:] %}\n    {%- endif %}\n{%- else %}\n    {%- set system_message = default_system_message %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n    {%- if message['role'] == 'user' %}\n            {%- if message['content'] is string %}\n            {{- '[INST]' + message['content'] + '[/INST]' }}\n            {%- else %}\n                    {{- '[INST]' }}\n                    {%- for block in message['content'] %}\n                            {%- if block['type'] == 'text' %}\n                                    {{- block['text'] }}\n                            {%- elif block['type'] == 'image' or block['type'] == 'image_url' %}\n                                    {{- '[IMG]' }}\n                         
       {%- else %}\n                                    {{- raise_exception('Only text and image blocks are supported in message content!') }}\n                                {%- endif %}\n                        {%- endfor %}\n                    {{- '[/INST]' }}\n                {%- endif %}\n    {%- elif message['role'] == 'system' %}\n        {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n    {%- elif message['role'] == 'assistant' %}\n        {%- if message['content'] is string %}\n            {{- message['content'] + eos_token }}\n        {%- else %}\n            {{- message['content'][0]['text'] + eos_token }}\n        {%- endif %}\n    {%- else %}\n        {{- raise_exception('Only user, system and assistant roles are supported!') }}\n    {%- endif %}\n{%- endfor %}",
            "patch_size": 128,
        }  # fmt: skip
    def test_image_token_filling(self):
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        # Important to check with non square image
        image = torch.randint(0, 2, (3, 500, 316))
-        expected_image_tokens = 198
+        expected_image_tokens = 4
        image_token_index = 10
        messages = [
            {
                "role": "system",
-                "content": "",
+                "content": [{"type": "text", "text": "You are a helpful assistant."}],
            },
            {
                "role": "user",
@@ -104,14 +110,14 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(len(inputs_image["input_ids"]) == 1)
        self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
        self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
-        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 30]))
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 36]))
        # fmt: off
        input_ids = inputs_image["input_ids"]
        self.assertEqual(
            input_ids[0].tolist(),
            # Equivalent to "USER: [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:"
-            [1, 21510,  1058,  1032,    10,    10,    12,    10,    10,    13,  1010, 7493,  1681,  1278,  4701,  1307,  1278,  3937,  1063,  1349,  4290, 16002, 41150,  1058]
+            [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
        )
        # fmt: on
@@ -121,36 +127,36 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(len(inputs_url["input_ids"]) == 1)
        self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
        self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
-        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 30]))
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 36]))
        # fmt: off
        input_ids = inputs_url["input_ids"]
        self.assertEqual(
            input_ids[0].tolist(),
            # Equivalent to "USER: [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:"
-            [1, 21510,  1058,  1032,    10,    10,    12,    10,    10,    13,  1010, 7493,  1681,  1278,  4701,  1307,  1278,  3937,  1063,  1349,  4290, 16002, 41150,  1058]
+            [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
        )
        # fmt: on
        # Test passing inputs as a single list
        inputs_image = processor(text=prompt_string, images=[self.image_0], return_tensors="pt")
-        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 30]))
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 36]))
        # fmt: off
        self.assertEqual(
            inputs_image["input_ids"][0].tolist(),
-            [1, 21510,  1058,  1032,    10,    10,    12,    10,    10,    13,  1010, 7493,  1681,  1278,  4701,  1307,  1278,  3937,  1063,  1349,  4290, 16002, 41150,  1058]
+            [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
        )
        # fmt: on
        # Test as nested single list
        inputs_image = processor(text=prompt_string, images=[[self.image_0]], return_tensors="pt")
-        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 30]))
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 24, 36]))
        # fmt: off
        self.assertEqual(
            inputs_image["input_ids"][0].tolist(),
-            [1, 21510,  1058,  1032,    10,    10,    12,    10,    10,    13,  1010, 7493,  1681,  1278,  4701,  1307,  1278,  3937,  1063,  1349,  4290, 16002, 41150,  1058]
+            [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
        )
        # fmt: on
@@ -168,14 +174,14 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(len(inputs_image["input_ids"]) == 1)
        self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
        self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
-        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 30]))
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 36]))
        # fmt: off
        input_ids = inputs_image["input_ids"]
        self.assertEqual(
            input_ids[0].tolist(),
            # Equivalent to ["USER: [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
-            [1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+            [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
                    )
        # fmt: on
@@ -185,25 +191,25 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(len(inputs_url["input_ids"]) == 1)
        self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
        self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
-        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 30]))
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 36]))
        # fmt: off
        input_ids = inputs_url["input_ids"]
        self.assertEqual(
            input_ids[0].tolist(),
            # Equivalent to ["USER: [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
-            [1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+            [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
        )
        # fmt: on
        # Test passing in as a nested list
        inputs_url = processor(text=prompt_string, images=[[self.image_0, self.image_1]], return_tensors="pt")
-        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 30]))
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 24, 36]))
        # fmt: off
        self.assertEqual(
            inputs_url["input_ids"][0].tolist(),
-            [1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+            [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
        )
        # fmt: on
@@ -226,14 +232,14 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(len(inputs_image["input_ids"]) == 2)
        self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
        self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
-        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 30, 30]))
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 36, 36]))
        # fmt: off
        input_ids = inputs_image["input_ids"]
        self.assertEqual(
            input_ids[0].tolist(),
            # Equivalent to ["USER: [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
-            [1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+            [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
        )
        # fmt: on
@@ -243,14 +249,14 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(len(inputs_url["input_ids"]) == 2)
        self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
        self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
-        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 30, 30]))
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 36, 36]))
        # fmt: off
        input_ids = inputs_url["input_ids"]
        self.assertEqual(
            input_ids[0].tolist(),
            # Equivalent to ["USER: [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
-            [1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+             [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
        )
        # fmt: on
@@ -258,12 +264,12 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        inputs_image = processor(
            text=prompt_string, images=[self.image_0, self.image_1, self.image_2], return_tensors="pt", padding=True
        )
-        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 30, 30]))
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 36, 36]))
        # fmt: off
        self.assertEqual(
            inputs_image["input_ids"][0].tolist(),
-            [1, 21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+            [1, 21510, 1058, 1032, 10, 10, 10, 12, 10, 10, 10, 13, 10, 10, 10, 12, 10, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
        )
        # fmt: on
--- a/tests/models/mllama/test_processor_mllama.py
+++ b/tests/models/mllama/test_processor_mllama.py
@@ -16,7 +16,6 @@ import json
 import shutil
 import tempfile
 import unittest
 from typing import Optional
 import numpy as np
@@ -333,20 +332,6 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        with self.assertRaises(ValueError):
            processor(text=text, images=None, padding=True)
    # Override as MllamaProcessor needs image tokens in prompts
    def prepare_text_inputs(self, batch_size: Optional[int] = None):
        if batch_size is None:
            return "lower newer <|image|>"
        if batch_size < 1:
            raise ValueError("batch_size must be greater than 0")
        if batch_size == 1:
            return ["lower newer <|image|>"]
        return ["lower newer <|image|>", "<|image|> upper older longer string"] + ["<|image|> lower newer"] * (
            batch_size - 2
        )
    def test_unstructured_kwargs_batched(self):
        # Overridden because Mllama expects images in nested format. For 2 images it can't infer
        # the correct nesting, so we better throw an error
@@ -357,7 +342,7 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs(batch_size=2)
+        input_str = self.prepare_text_inputs(batch_size=2, modality="image")
        image_input = self.prepare_image_inputs(batch_size=2)
        image_input = [[image_input[0]], [image_input[1]]]
        inputs = processor(
--- a/tests/models/paligemma/test_processor_paligemma.py
+++ b/tests/models/paligemma/test_processor_paligemma.py
@@ -37,10 +37,11 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def setUpClass(cls):
        cls.tmpdirname = tempfile.mkdtemp()
        image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
-        image_processor.image_seq_length = 0
+        image_processor.image_seq_length = 0  # TODO: raushan fix me in #37342
        tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
        processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer)
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
    @classmethod
    def tearDownClass(cls):
--- a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
@@ -43,8 +43,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.tmpdirname = tempfile.mkdtemp()
-        processor = Qwen2_5_VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", patch_size=4)
+        processor = Qwen2_5_VLProcessor.from_pretrained(
            "Qwen/Qwen2-VL-7B-Instruct", patch_size=4, max_pixels=56 * 56, min_pixels=28 * 28
        )
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
    def get_tokenizer(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -52,8 +55,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
-    def prepare_processor_dict(self):
+    @staticmethod
-        return {"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"}  # fmt: skip
+    def prepare_processor_dict():
        return {
            "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
        }  # fmt: skip
    @classmethod
    def tearDownClass(cls):
@@ -206,7 +212,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(input_name in out_dict)
        self.assertEqual(len(out_dict["input_ids"]), batch_size)
        self.assertEqual(len(out_dict["attention_mask"]), batch_size)
-        self.assertEqual(len(out_dict[input_name]), batch_size * 19200)
+        self.assertEqual(len(out_dict[input_name]), batch_size * 192)
        return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
        for k in out_dict:
@@ -261,7 +267,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            num_frames=num_frames,
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 115200)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
        # Load with `video_fps` arg
        video_fps = 1
@@ -273,7 +279,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            video_fps=video_fps,
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 288000)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 900)
        # Load with `video_fps` and `num_frames` args, should raise an error
        with self.assertRaises(ValueError):
@@ -294,7 +300,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            return_dict=True,
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8640000)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 27000)
        # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
        # because we assume they come from one video
@@ -312,7 +318,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            return_dict=True,
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)
    def test_kwargs_overrides_custom_image_processor_kwargs(self):
        processor_components = self.prepare_components()
@@ -328,7 +334,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
        self.assertEqual(inputs[self.images_input_name].shape[0], 612)
        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
-        self.assertEqual(inputs[self.images_input_name].shape[0], 800)
+        self.assertEqual(inputs[self.images_input_name].shape[0], 100)
    @require_av
    def test_apply_chat_template_video_special_processing(self):
@@ -395,4 +401,4 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
        formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
        self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1756800)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
--- a/tests/models/qwen2_audio/test_processor_qwen2_audio.py
+++ b/tests/models/qwen2_audio/test_processor_qwen2_audio.py
@@ -14,7 +14,6 @@
 import shutil
 import tempfile
 import unittest
 from typing import Optional
 from transformers import AutoProcessor, AutoTokenizer, Qwen2AudioProcessor, WhisperFeatureExtractor
 from transformers.testing_utils import require_torch, require_torchaudio
@@ -40,6 +39,7 @@ class Qwen2AudioProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor_kwargs = cls.prepare_processor_dict()
        processor = Qwen2AudioProcessor.from_pretrained(cls.checkpoint, **processor_kwargs)
        processor.save_pretrained(cls.tmpdirname)
        cls.audio_token = processor.audio_token
    def get_tokenizer(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -57,20 +57,6 @@ class Qwen2AudioProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            "chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or content['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
        }
    # Override as Qwen2AudioProcessor needs audio tokens in prompts
    def prepare_text_inputs(self, batch_size: Optional[int] = None):
        if batch_size is None:
            return "lower newer <|AUDIO|>"
        if batch_size < 1:
            raise ValueError("batch_size must be greater than 0")
        if batch_size == 1:
            return ["lower newer <|AUDIO|>"]
        return ["lower newer <|AUDIO|>", "<|AUDIO|> upper older longer string"] + ["<|AUDIO|> lower newer"] * (
            batch_size - 2
        )
    def test_can_load_various_tokenizers(self):
        processor = Qwen2AudioProcessor.from_pretrained(self.checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
--- a/tests/models/qwen2_vl/test_processor_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.py
@@ -43,8 +43,11 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.tmpdirname = tempfile.mkdtemp()
-        processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", patch_size=4)
+        processor = Qwen2VLProcessor.from_pretrained(
            "Qwen/Qwen2-VL-7B-Instruct", patch_size=4, max_pixels=56 * 56, min_pixels=28 * 28
        )
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
    def get_tokenizer(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -52,7 +55,8 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
-    def prepare_processor_dict(self):
+    @staticmethod
    def prepare_processor_dict():
        return {"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"}  # fmt: skip
    @classmethod
@@ -203,7 +207,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(input_name in out_dict)
        self.assertEqual(len(out_dict["input_ids"]), batch_size)
        self.assertEqual(len(out_dict["attention_mask"]), batch_size)
-        self.assertEqual(len(out_dict[input_name]), batch_size * 19200)
+        self.assertEqual(len(out_dict[input_name]), batch_size * 192)
        return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
        for k in out_dict:
@@ -258,7 +262,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            num_frames=num_frames,
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 115200)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
        # Load with `video_fps` arg
        video_fps = 1
@@ -270,7 +274,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            video_fps=video_fps,
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 288000)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 900)
        # Load with `video_fps` and `num_frames` args, should raise an error
        with self.assertRaises(ValueError):
@@ -291,7 +295,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            return_dict=True,
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8640000)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 27000)
        # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
        # because we assume they come from one video
@@ -309,7 +313,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            return_dict=True,
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)
    @require_av
    def test_apply_chat_template_video_special_processing(self):
@@ -376,7 +380,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
        formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
        self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1756800)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
    def test_kwargs_overrides_custom_image_processor_kwargs(self):
        processor_components = self.prepare_components()
@@ -390,6 +394,6 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
-        self.assertEqual(inputs[self.images_input_name].shape[0], 800)
+        self.assertEqual(inputs[self.images_input_name].shape[0], 100)
        inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
        self.assertEqual(inputs[self.images_input_name].shape[0], 612)
--- a/tests/models/smolvlm/test_processor_smolvlm.py
+++ b/tests/models/smolvlm/test_processor_smolvlm.py
@@ -16,7 +16,6 @@ import shutil
 import tempfile
 import unittest
 from io import BytesIO
 from typing import Optional
 import numpy as np
 import requests
@@ -42,7 +41,8 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.tmpdirname = tempfile.mkdtemp()
-        processor = SmolVLMProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct", image_seq_len=2)
+        processor_kwargs = cls.prepare_processor_dict()
        processor = SmolVLMProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct", **processor_kwargs)
        processor.save_pretrained(cls.tmpdirname)
        cls.image1 = Image.open(
            BytesIO(
@@ -82,9 +82,10 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def get_processor(self, **kwargs):
        """Load a processor through `AutoProcessor` from `self.tmpdirname`, forwarding any extra `kwargs`."""
        saved_dir = self.tmpdirname
        return AutoProcessor.from_pretrained(saved_dir, **kwargs)
-    def prepare_processor_dict(self):
+    @staticmethod
    def prepare_processor_dict():
        return {
-            "image_seq_len": self.image_seq_len,
+            "image_seq_len": 2,
            "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
        }
@@ -426,106 +427,6 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        # NOTE: the last assert checks are removed
        # Loading video as a list of frames (i.e. images) is not supported in SmolVLM
    # Override as SmolVLMProcessor needs image tokens in prompts
    def prepare_text_inputs(self, batch_size: Optional[int] = None):
        """Build dummy prompts that each contain an `<image>` placeholder.

        Args:
            batch_size: Number of prompts to produce. When `None`, a single string is returned instead of a list.

        Returns:
            A single prompt string if `batch_size` is `None`, otherwise a list with `batch_size` prompts.

        Raises:
            ValueError: If `batch_size` is smaller than 1.
        """
        first_prompt = "lower newer <image>"
        if batch_size is None:
            return first_prompt
        if batch_size < 1:
            raise ValueError("batch_size must be greater than 0")
        prompts = [first_prompt]
        if batch_size != 1:
            # The second prompt is longer than the others; every remaining slot gets the same filler prompt.
            prompts.append("<image> upper older longer string")
            prompts += ["<image> lower newer"] * (batch_size - 2)
        return prompts
    # Override these tests because the padded dimension of `input_ids` is the second one, not the last one
    @require_vision
    @require_torch
    def test_kwargs_overrides_default_tokenizer_kwargs(self):
        """Calling the processor with `max_length=30` must yield a first `input_ids` row of length 30."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        vision_component = self.get_component("image_processor")
        # The tokenizer component is requested with the same `max_length` that is passed at call time.
        text_component = self.get_component("tokenizer", max_length=30)
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)
        encoded = processor(
            text=self.prepare_text_inputs(),
            images=self.prepare_image_inputs(),
            return_tensors="pt",
            max_length=30,
        )
        first_row = encoded["input_ids"][0]
        self.assertEqual(len(first_row), 30)
    @require_torch
    @require_vision
    def test_structured_kwargs_nested(self):
        """Pass per-modality kwargs as nested dicts and check the image width and the padded text length."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        vision_component = self.get_component("image_processor")
        text_component = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)
        prompt = self.prepare_text_inputs()
        images = self.prepare_image_inputs()
        # One dict per modality, plus the kwargs shared by all of them
        shared_options = {"return_tensors": "pt"}
        image_options = {"max_image_size": {"longest_edge": 32}}
        text_options = {"padding": "max_length", "max_length": 120, "truncation": "longest_first"}
        encoded = processor(
            text=prompt,
            images=images,
            common_kwargs=shared_options,
            images_kwargs=image_options,
            text_kwargs=text_options,
        )
        # NOTE(review): this repeats the check made right after construction; it looks redundant - confirm
        self.skip_processor_without_typed_kwargs(processor)
        image_width = encoded["pixel_values"].shape[3]
        text_length = len(encoded["input_ids"][0])
        self.assertEqual(image_width, 32)
        self.assertEqual(text_length, 120)
    @require_torch
    @require_vision
    def test_structured_kwargs_nested_from_dict(self):
        """Unpack one dict of per-modality kwargs into the call and check the resulting image and text sizes."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        vision_component = self.get_component("image_processor")
        text_component = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)
        prompt = self.prepare_text_inputs()
        images = self.prepare_image_inputs()
        # Group the kwargs by modality, then unpack the whole mapping into the call
        nested_options = {}
        nested_options["common_kwargs"] = {"return_tensors": "pt"}
        nested_options["images_kwargs"] = {"max_image_size": {"longest_edge": 32}}
        nested_options["text_kwargs"] = {"padding": "max_length", "max_length": 120, "truncation": "longest_first"}
        encoded = processor(text=prompt, images=images, **nested_options)
        image_width = encoded["pixel_values"].shape[3]
        text_length = len(encoded["input_ids"][0])
        self.assertEqual(image_width, 32)
        self.assertEqual(text_length, 120)
    @require_vision
    @require_torch
    def test_tokenizer_defaults_preserved_by_kwargs(self):
        """With no length kwargs at call time, the first `input_ids` row must still have length 30."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        vision_component = self.get_component("image_processor")
        # `max_length=30` is only given when fetching the tokenizer component, never in the processor call.
        text_component = self.get_component("tokenizer", max_length=30)
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)
        encoded = processor(
            text=self.prepare_text_inputs(),
            images=self.prepare_image_inputs(),
            return_tensors="pt",
        )
        first_row = encoded["input_ids"][0]
        self.assertEqual(len(first_row), 30)
    @require_torch
    @require_vision
    def test_unstructured_kwargs_batched(self):
@@ -537,7 +438,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs(batch_size=2)
+        input_str = self.prepare_text_inputs(batch_size=2, modality="image")
        image_input = self.prepare_image_inputs(batch_size=2)
        image_input = [[image_input[0]], [image_input[1]]]
        inputs = processor(
@@ -554,32 +455,6 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertEqual(inputs["pixel_values"].shape[3], 30)
        self.assertEqual(len(inputs["input_ids"][0]), 76)
    @require_torch
    @require_vision
    def test_unstructured_kwargs(self):
        """Pass image and text kwargs flat (not grouped by modality) and check the resulting sizes."""
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        vision_component = self.get_component("image_processor")
        text_component = self.get_component("tokenizer")
        processor = self.processor_class(tokenizer=text_component, image_processor=vision_component)
        self.skip_processor_without_typed_kwargs(processor)
        prompt = self.prepare_text_inputs()
        images = self.prepare_image_inputs()
        # Image and text options are mixed together in a single flat set of keyword arguments.
        flat_options = {
            "return_tensors": "pt",
            "max_image_size": {"longest_edge": 32},
            "padding": "max_length",
            "max_length": 120,
            "truncation": "longest_first",
        }
        encoded = processor(text=prompt, images=images, **flat_options)
        image_width = encoded["pixel_values"].shape[3]
        text_length = len(encoded["input_ids"][0])
        self.assertEqual(image_width, 32)
        self.assertEqual(text_length, 120)
    @require_torch
    @require_vision
    def test_text_only_inference(self):
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -92,7 +92,8 @@ class ProcessorTesterMixin:
    videos_input_name = "pixel_values_videos"
    audio_input_name = "input_features"
-    def prepare_processor_dict(self):
+    @staticmethod
    def prepare_processor_dict():
        return {}
    def get_component(self, attribute, **kwargs):
@@ -123,18 +124,23 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**components, **self.prepare_processor_dict())
        return processor
-    # TODO: raushan unify all these special token LLMs under the general preparation. We can get audio/image token
+    def prepare_text_inputs(self, batch_size: Optional[int] = None, modality: Optional[str] = None):
-    # from tokenizer, so we can generalize instead of overriding
+        if modality is not None:
-    def prepare_text_inputs(self, batch_size: Optional[int] = None):
+            special_token_to_add = getattr(self, f"{modality}_token", "")
        else:
            special_token_to_add = ""
        if batch_size is None:
-            return "lower newer"
+            return f"lower newer {special_token_to_add}"
        if batch_size < 1:
            raise ValueError("batch_size must be greater than 0")
        if batch_size == 1:
-            return ["lower newer"]
+            return [f"lower newer {special_token_to_add}"]
-        return ["lower newer", "upper older longer string"] + ["lower newer"] * (batch_size - 2)
+        return [f"lower newer {special_token_to_add}", f" {special_token_to_add} upper older longer string"] + [
            f"lower newer {special_token_to_add}"
        ] * (batch_size - 2)
    @require_vision
    def prepare_image_inputs(self, batch_size: Optional[int] = None):
@@ -159,6 +165,13 @@ class ProcessorTesterMixin:
        # Compare every expected processor kwarg against both `obj` and the processor attribute.
        for key, value in self.prepare_processor_dict().items():
            # Chat template is saved as a separate file. Use an exact comparison here: a substring test
            # (`key not in "chat_template"`) would also skip keys such as "chat" or "template".
            if key != "chat_template":
                saved = obj[key]
                # json converts dict keys to str, but some processors force convert back to int when init.
                # Only probe the first key when both mappings are non-empty dicts, so that empty dicts and
                # non-dict expected values fall through to the assertions instead of raising.
                if (
                    isinstance(saved, dict)
                    and isinstance(value, dict)
                    and saved
                    and value
                    and isinstance(next(iter(saved)), str)
                    and isinstance(next(iter(value)), int)
                ):
                    obj[key] = {int(k): v for k, v in saved.items()}
                self.assertEqual(obj[key], value)
                self.assertEqual(getattr(processor, key, None), value)
@@ -206,7 +219,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="image")
        image_input = self.prepare_image_inputs()
        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
        self.assertEqual(inputs[self.text_input_name].shape[-1], 117)
@@ -229,7 +242,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="image")
        image_input = self.prepare_image_inputs()
        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
@@ -244,7 +257,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="image")
        image_input = self.prepare_image_inputs()
        inputs = processor(
            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
@@ -264,7 +277,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="image")
        image_input = self.prepare_image_inputs()
        inputs = processor(text=input_str, images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
@@ -278,7 +291,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="image")
        image_input = self.prepare_image_inputs()
        inputs = processor(
            text=input_str,
@@ -301,7 +314,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs(batch_size=2)
+        input_str = self.prepare_text_inputs(batch_size=2, modality="image")
        image_input = self.prepare_image_inputs(batch_size=2)
        inputs = processor(
            text=input_str,
@@ -327,7 +340,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = [self.prepare_text_inputs()]
+        input_str = [self.prepare_text_inputs(modality="image")]
        image_input = self.prepare_image_inputs()
        with self.assertRaises(ValueError):
            _ = processor(
@@ -346,7 +359,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="image")
        image_input = self.prepare_image_inputs()
        # Define the kwargs for each modality
@@ -369,7 +382,7 @@ class ProcessorTesterMixin:
        processor_kwargs = self.prepare_processor_dict()
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="image")
        image_input = self.prepare_image_inputs()
        # Define the kwargs for each modality
@@ -396,7 +409,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs(batch_size=3)
+        input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
        raw_speech = floats_list((3, 1000))
        raw_speech = [np.asarray(audio) for audio in raw_speech]
        inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt")
@@ -414,7 +427,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs(batch_size=3)
+        input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
        raw_speech = floats_list((3, 1000))
        raw_speech = [np.asarray(audio) for audio in raw_speech]
        inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=300, padding="max_length")
@@ -433,7 +446,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs(batch_size=3)
+        input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
        raw_speech = floats_list((3, 1000))
        raw_speech = [np.asarray(audio) for audio in raw_speech]
        inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=300, padding="max_length")
@@ -452,7 +465,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs(batch_size=3)
+        input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
        raw_speech = floats_list((3, 1000))
        raw_speech = [np.asarray(audio) for audio in raw_speech]
        with self.assertRaises(ValueError):
@@ -476,7 +489,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs(batch_size=3)
+        input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
        raw_speech = floats_list((3, 1000))
        raw_speech = [np.asarray(audio) for audio in raw_speech]
@@ -499,7 +512,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="video")
        video_input = self.prepare_video_inputs()
        inputs = processor(text=input_str, videos=video_input, return_tensors="pt")
        self.assertEqual(inputs[self.text_input_name].shape[-1], 117)
@@ -522,7 +535,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="video")
        video_input = self.prepare_video_inputs()
        inputs = processor(text=input_str, videos=video_input, return_tensors="pt")
@@ -537,7 +550,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="video")
        video_input = self.prepare_video_inputs()
        inputs = processor(
            text=input_str, videos=video_input, return_tensors="pt", max_length=112, padding="max_length"
@@ -557,7 +570,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="video")
        video_input = self.prepare_video_inputs()
        inputs = processor(text=input_str, videos=video_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
@@ -571,7 +584,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="video")
        video_input = self.prepare_video_inputs()
        inputs = processor(
            text=input_str,
@@ -594,7 +607,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs(batch_size=2)
+        input_str = self.prepare_text_inputs(batch_size=2, modality="video")
        video_input = self.prepare_video_inputs(batch_size=2)
        inputs = processor(
            text=input_str,
@@ -620,7 +633,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = [self.prepare_text_inputs()]
+        input_str = [self.prepare_text_inputs(modality="video")]
        video_input = self.prepare_video_inputs()
        with self.assertRaises(ValueError):
            _ = processor(
@@ -639,7 +652,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="video")
        video_input = self.prepare_video_inputs()
        # Define the kwargs for each modality
@@ -662,7 +675,7 @@ class ProcessorTesterMixin:
        processor_kwargs = self.prepare_processor_dict()
        processor = self.processor_class(**processor_components, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="video")
        video_input = self.prepare_video_inputs()
        # Define the kwargs for each modality
@@ -686,7 +699,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs()
+        input_str = self.prepare_text_inputs(modality="image")
        image_input = self.prepare_image_inputs()
        with self.assertRaises(ValueError):
@@ -713,7 +726,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = self.prepare_text_inputs(batch_size=3)
+        input_str = self.prepare_text_inputs(batch_size=3, modality="audio")
        audio_lengths = [4000, 8000, 16000, 32000]
        raw_speech = [np.asarray(audio)[:length] for audio, length in zip(floats_list((3, 32_000)), audio_lengths)]