Add support for custom inputs and batched inputs in ProcessorTesterMixin (#33711)

* Add support for custom inputs and batched inputs in ProcessorTesterMixin
* Fix batch_size behavior in ProcessorTesterMixin
* Change the format of batched inputs returned by the prepare-inputs helpers
* Remove the overridden test in the Pixtral processor tests
* Remove unnecessary tests and clean up after adding the new prepare_inputs functions
* Fix the InstructBlipVideo image processor
2024-10-01 23:52:03 +02:00
parent 1baa08897d
commit 61ac161a9d
8 changed files with 95 additions and 269 deletions
--- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py
@@ -57,7 +57,10 @@ def make_batched_videos(videos) -> List[VideoInput]:
        elif len(videos[0].shape) == 4:
            return [list(video) for video in videos]

-    elif is_valid_image(videos) and len(videos.shape) == 4:
+    elif is_valid_image(videos):
+        if isinstance(videos, PIL.Image.Image):
+            return [[videos]]
+        elif len(videos.shape) == 4:
            return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")
--- a/tests/models/fuyu/test_processing_fuyu.py
+++ b/tests/models/fuyu/test_processing_fuyu.py
@@ -190,7 +190,7 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        # Fuyu uses tokenizer kwargs only when image is None.
        image_input = None

@@ -218,7 +218,7 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        # Fuyu uses tokenizer kwargs only when image is None.
        image_input = None

@@ -237,7 +237,7 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        # Fuyu uses tokenizer kwargs only when image is None.
        image_input = None

@@ -264,7 +264,7 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        # Fuyu uses tokenizer kwargs only when image is None.
        image_input = None

@@ -290,7 +290,7 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        # Fuyu uses tokenizer kwargs only when image is None.
        image_input = None
        inputs = processor(
@@ -315,7 +315,7 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = ["lower newer", "upper older longer string"]
+        input_str = self.prepare_text_inputs(batch_size=2)
        # Fuyu uses tokenizer kwargs only when image is None.
        image_input = None
        inputs = processor(
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -17,6 +17,7 @@ import shutil
 import tempfile
 import unittest
 from io import BytesIO
+from typing import Optional

 import numpy as np
 import requests
@@ -284,44 +285,29 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        )
        self.assertEqual(rendered, expected_rendered)

-    @require_torch
-    @require_vision
-    def test_image_processor_defaults_preserved_by_image_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer", max_length=117)
+    # Override as Idefics3Processor needs image tokens in prompts
+    def prepare_text_inputs(self, batch_size: Optional[int] = None):
+        if batch_size is None:
+            return "lower newer <image>"

-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
+        if batch_size < 1:
+            raise ValueError("batch_size must be greater than 0")

-        input_str = "lower newer <image>"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input)
-        self.assertEqual(len(inputs["pixel_values"][0][0]), 3)
-        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 364)  # crop size doesn't affect our image processor
-
-    @require_torch
-    @require_vision
-    def test_kwargs_overrides_default_image_processor_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component(
-            "image_processor", max_image_size={"longest_edge": 32}, size={"longest_edge": 32}
+        if batch_size == 1:
+            return ["lower newer <image>"]
+        return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * (
+            batch_size - 2
        )
-        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")

-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor, image_seq_len=2)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer <image>"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input)
-        self.assertEqual(len(inputs["pixel_values"][0][0]), 3)
-        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 32)
-        self.assertEqual(len(inputs["input_ids"][0]), 117)
+    # Override as Idefics3Processor needs nested images to work properly with batched inputs
+    @require_vision
+    def prepare_image_inputs(self, batch_size: Optional[int] = None):
+        """This function prepares a list of PIL images for testing"""
+        if batch_size is None:
+            return super().prepare_image_inputs()
+        if batch_size < 1:
+            raise ValueError("batch_size must be greater than 0")
+        return [[super().prepare_image_inputs()]] * batch_size

    @require_vision
    @require_torch
@@ -333,7 +319,7 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer<image>"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30)
@@ -350,7 +336,7 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer<image>"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # Define the kwargs for each modality
@@ -378,7 +364,7 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer<image>"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # Define the kwargs for each modality
@@ -402,7 +388,7 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer<image>"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
@@ -419,11 +405,11 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = ["<image>lower newer", "<image>upper older longer string"]
-        image_input = self.prepare_image_inputs()
+        input_str = self.prepare_text_inputs(batch_size=2)
+        image_input = self.prepare_image_inputs(batch_size=2)
        inputs = processor(
            text=input_str,
-            images=[image_input, image_input],
+            images=image_input,
            return_tensors="pt",
            padding="longest",
            max_length=76,
@@ -446,7 +432,7 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer<image>"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        inputs = processor(
            text=input_str,
--- a/tests/models/kosmos2/test_processor_kosmos2.py
+++ b/tests/models/kosmos2/test_processor_kosmos2.py
@@ -499,7 +499,7 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        # set image input to None
        image_input = None

@@ -525,7 +525,7 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # Define the kwargs for each modality
@@ -551,7 +551,7 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # Define the kwargs for each modality
@@ -574,7 +574,7 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        # set image input to None
        image_input = None

@@ -593,7 +593,7 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        # set image input to None
        image_input = None
        inputs = processor(
@@ -618,7 +618,7 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = ["lower newer", "upper older longer string"]
+        input_str = self.prepare_text_inputs(batch_size=2)
        # set image input to None
        image_input = None
        inputs = processor(
--- a/tests/models/omdet_turbo/test_processor_omdet_turbo.py
+++ b/tests/models/omdet_turbo/test_processor_omdet_turbo.py
@@ -17,7 +17,6 @@ import shutil
 import tempfile
 import unittest

-import numpy as np
 import pytest

 from transformers import AutoProcessor, CLIPTokenizerFast, OmDetTurboProcessor
@@ -36,8 +35,6 @@ if is_torch_available():
    from transformers.models.omdet_turbo.modeling_omdet_turbo import OmDetTurboObjectDetectionOutput

 if is_vision_available():
-    from PIL import Image
-
    from transformers import DetrImageProcessor


@@ -45,6 +42,7 @@ if is_vision_available():
@require_vision
 class OmDetTurboProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = OmDetTurboProcessor
+    text_input_name = "classes_input_ids"

    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()
@@ -77,17 +75,6 @@ class OmDetTurboProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

-    def prepare_image_inputs(self):
-        """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
-        or a list of PyTorch tensors if one specifies torchify=True.
-        """
-
-        image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)]
-
-        image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
-
-        return image_inputs
-
    def get_fake_omdet_turbo_output(self):
        torch.manual_seed(42)
        return OmDetTurboObjectDetectionOutput(
@@ -210,154 +197,3 @@ class OmDetTurboProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        inputs = processor(images=image_input, text=input_classes, task=input_tasks, return_tensors="pt")

        self.assertListEqual(list(inputs.keys()), self.input_keys)
-
-    @require_vision
-    @require_torch
-    def test_tokenizer_defaults_preserved_by_kwargs(self):
-        # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes.
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer", max_length=117)
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-        inputs = processor(images=image_input, text=[input_str], task=input_str, return_tensors="pt")
-
-        self.assertEqual(len(inputs["tasks_input_ids"][0]), 117)
-        self.assertEqual(len(inputs["classes_input_ids"][0]), 117)
-
-    @require_vision
-    @require_torch
-    def test_kwargs_overrides_default_tokenizer_kwargs(self):
-        # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes.
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer", max_length=117)
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-        inputs = processor(images=image_input, text=[input_str], task=input_str, return_tensors="pt", max_length=112)
-
-        self.assertEqual(len(inputs["tasks_input_ids"][0]), 112)
-        self.assertEqual(len(inputs["classes_input_ids"][0]), 112)
-
-    @require_torch
-    @require_vision
-    def test_unstructured_kwargs(self):
-        # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes.
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-        inputs = processor(
-            images=image_input,
-            text=[input_str],
-            task=input_str,
-            return_tensors="pt",
-            size={"height": 214, "width": 214},
-            padding="max_length",
-            max_length=76,
-        )
-
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-        self.assertEqual(len(inputs["tasks_input_ids"][0]), 76)
-        self.assertEqual(len(inputs["classes_input_ids"][0]), 76)
-
-    @require_torch
-    @require_vision
-    def test_unstructured_kwargs_batched(self):
-        # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes.
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = ["lower newer", "upper older longer string"]
-        image_input = self.prepare_image_inputs() * 2
-        inputs = processor(
-            images=image_input,
-            text=[input_str],
-            task=input_str,
-            return_tensors="pt",
-            size={"height": 214, "width": 214},
-            padding="longest",
-            max_length=76,
-        )
-
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-
-        self.assertEqual(len(inputs["tasks_input_ids"][0]), 6)
-        self.assertEqual(len(inputs["classes_input_ids"][0]), 6)
-
-    @require_torch
-    @require_vision
-    def test_structured_kwargs_nested(self):
-        # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes.
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        # Define the kwargs for each modality
-        all_kwargs = {
-            "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"size": {"height": 214, "width": 214}},
-            "text_kwargs": {"padding": "max_length", "max_length": 76, "task": input_str},
-        }
-
-        inputs = processor(images=image_input, text=[input_str], **all_kwargs)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-
-        self.assertEqual(len(inputs["tasks_input_ids"][0]), 76)
-        self.assertEqual(len(inputs["classes_input_ids"][0]), 76)
-
-    @require_torch
-    @require_vision
-    def test_structured_kwargs_nested_from_dict(self):
-        # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes.
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        # Define the kwargs for each modality
-        all_kwargs = {
-            "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"size": {"height": 214, "width": 214}},
-            "text_kwargs": {"padding": "max_length", "max_length": 76, "task": input_str},
-        }
-
-        inputs = processor(images=image_input, text=[input_str], **all_kwargs)
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-
-        self.assertEqual(len(inputs["tasks_input_ids"][0]), 76)
-        self.assertEqual(len(inputs["classes_input_ids"][0]), 76)
--- a/tests/models/pix2struct/test_processor_pix2struct.py
+++ b/tests/models/pix2struct/test_processor_pix2struct.py
@@ -96,7 +96,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()

        encoded_processor = processor(text=input_str)

@@ -111,7 +111,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input)
@@ -130,7 +130,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input)
@@ -168,7 +168,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input)
@@ -195,7 +195,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input)
@@ -213,7 +213,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input, max_patches=1024)
@@ -231,7 +231,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        inputs = processor(
            text=input_str,
@@ -257,8 +257,8 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = ["lower newer", "upper older longer string"]
-        image_input = self.prepare_image_inputs() * 2
+        input_str = self.prepare_text_inputs(batch_size=2)
+        image_input = self.prepare_image_inputs(batch_size=2)
        inputs = processor(
            text=input_str,
            images=image_input,
@@ -284,7 +284,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # Define the kwargs for each modality
@@ -313,7 +313,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # Define the kwargs for each modality
--- a/tests/models/pixtral/test_processor_pixtral.py
+++ b/tests/models/pixtral/test_processor_pixtral.py
@@ -14,6 +14,7 @@
 import shutil
 import tempfile
 import unittest
+from typing import Optional

 import requests
 import torch
@@ -246,27 +247,11 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        # fmt: on

    # Override as PixtralProcessor needs nested images to work properly with batched inputs
-    def test_unstructured_kwargs_batched(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        processor_components = self.prepare_components()
-        processor = self.processor_class(**processor_components)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = ["lower newer", "upper older longer string"]
-        image_input = [self.prepare_image_inputs()] * 2
-        inputs = processor(
-            text=input_str,
-            images=image_input,
-            return_tensors="pt",
-            do_rescale=True,
-            rescale_factor=-1,
-            padding="longest",
-            max_length=76,
-        )
-
-        self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
-        self.assertTrue(
-            len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1])
-            and len(inputs[self.text_input_name][1]) < 76
-        )
+    @require_vision
+    def prepare_image_inputs(self, batch_size: Optional[int] = None):
+        """This function prepares a list of PIL images for testing"""
+        if batch_size is None:
+            return super().prepare_image_inputs()
+        if batch_size < 1:
+            raise ValueError("batch_size must be greater than 0")
+        return [[super().prepare_image_inputs()]] * batch_size
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -17,6 +17,7 @@
 import inspect
 import json
 import tempfile
+from typing import Optional

 import numpy as np

@@ -86,10 +87,25 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**components, **self.prepare_processor_dict())
        return processor

+    def prepare_text_inputs(self, batch_size: Optional[int] = None):
+        if batch_size is None:
+            return "lower newer"
+
+        if batch_size < 1:
+            raise ValueError("batch_size must be greater than 0")
+
+        if batch_size == 1:
+            return ["lower newer"]
+        return ["lower newer", "upper older longer string"] + ["lower newer"] * (batch_size - 2)
+
    @require_vision
-    def prepare_image_inputs(self):
+    def prepare_image_inputs(self, batch_size: Optional[int] = None):
        """This function prepares a list of PIL images for testing"""
-        return prepare_image_inputs()
+        if batch_size is None:
+            return prepare_image_inputs()[0]
+        if batch_size < 1:
+            raise ValueError("batch_size must be greater than 0")
+        return prepare_image_inputs() * batch_size

    @require_vision
    def prepare_video_inputs(self):
@@ -148,7 +164,7 @@ class ProcessorTesterMixin:

        processor = self.processor_class(**processor_components)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
        self.assertEqual(inputs[self.text_input_name].shape[-1], 117)
@@ -170,7 +186,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
@@ -184,7 +200,7 @@ class ProcessorTesterMixin:

        processor = self.processor_class(**processor_components)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        inputs = processor(
            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
@@ -203,7 +219,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
@@ -216,7 +232,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        inputs = processor(
            text=input_str,
@@ -238,8 +254,8 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = ["lower newer", "upper older longer string"]
-        image_input = self.prepare_image_inputs() * 2
+        input_str = self.prepare_text_inputs(batch_size=2)
+        image_input = self.prepare_image_inputs(batch_size=2)
        inputs = processor(
            text=input_str,
            images=image_input,
@@ -263,7 +279,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = ["lower newer"]
+        input_str = [self.prepare_text_inputs()]
        image_input = self.prepare_image_inputs()
        with self.assertRaises(ValueError):
            _ = processor(
@@ -281,7 +297,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # Define the kwargs for each modality
@@ -303,7 +319,7 @@ class ProcessorTesterMixin:
        processor_components = self.prepare_components()
        processor = self.processor_class(**processor_components)
        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # Define the kwargs for each modality
@@ -326,7 +342,7 @@ class ProcessorTesterMixin:
        processor = self.processor_class(**processor_components)
        self.skip_processor_without_typed_kwargs(processor)

-        input_str = "lower newer"
+        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        with self.assertRaises(ValueError):