Pixtral: vectorize patch embeddings and enable tests (#35122)

* initial POC
* - batch mix feature
* fix tests
* fix tests
* make style
* do not skip and instead fix tests
* update
* return back the test
* correct text with the correct ckpt
2025-01-30 12:40:18 +01:00
parent 8bc4c89ee9
commit 9725e5be2f
10 changed files with 422 additions and 545 deletions
--- a/tests/models/llava/test_modeling_llava.py
+++ b/tests/models/llava/test_modeling_llava.py
@@ -564,9 +564,8 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
        self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT)

    @slow
-    @require_bitsandbytes
    def test_pixtral(self):
-        model_id = "hf-internal-testing/pixtral-12b"
+        model_id = "mistral-community/pixtral-12b"
        model = LlavaForConditionalGeneration.from_pretrained(model_id)
        processor = AutoProcessor.from_pretrained(model_id)

@@ -579,33 +578,75 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
        PROMPT = "<s>[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]"

        # image = Image.open(requests.get(url, stream=True).raw)
-        inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda")
+        inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(model.device)
        generate_ids = model.generate(**inputs, max_new_tokens=500)
        ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        print(ouptut)

        # fmt: off
        EXPECTED_GENERATION = """
 Describe the images.
-Sure, let's break down each image description:
+Certainly! Here are the descriptions of the images:

-1. **Image 1:**
-   - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera.
-   - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur.
+1. **Image 1**: This image features a black dog with a glossy coat sitting on a wooden surface. The dog has a calm and attentive expression, looking directly at the camera. The wooden background has a rustic appearance with visible grain and texture.

-2. **Image 2:**
-   - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley.
-   - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image.
+2. **Image 2**: This image captures a breathtaking view of a mountainous landscape. The mountains are rugged and covered with patches of green vegetation. The sky above is clear, and the scene conveys a sense of tranquility and natural beauty.

-3. **Image 3:**
-   - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset.
-   - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene.
+3. **Image 3**: This image shows a beach scene during sunset. The waves are gently rolling onto the shore, and several people can be seen in the water, possibly surfing or swimming. The sky is painted with warm hues of orange and yellow, creating a serene and picturesque atmosphere.

-4. **Image 4:**
-   - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers.
-   - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden.
+4. **Image 4**: This image depicts a narrow, winding path that cuts through a lush, green landscape. On either side of the path, there is dense grass and various trees, including a prominent tree with white blossoms. The sky is clear and blue, adding to the peaceful and inviting ambiance of the scene.

-Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it.
+These descriptions provide a detailed overview of the content and atmosphere of each image.
 """
        # fmt: on
        # check that both inputs are handled correctly and generate the same output
-        self.assertListEqual(ouptut, EXPECTED_GENERATION)
+        self.assertEqual(ouptut, EXPECTED_GENERATION)
+
+    @slow
+    @require_bitsandbytes
+    def test_pixtral_4bit(self):
+        """Generate from a two-image prompt with the checkpoint loaded via `load_in_4bit=True`.
+
+        The decoded text of the first returned sequence must equal the pinned expected string.
+        """
+        model_id = "mistral-community/pixtral-12b"
+        model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        processor = AutoProcessor.from_pretrained(model_id)
+
+        # Despite the name, these are PIL images opened from the downloaded responses, not URL strings.
+        IMG_URLS = [
+            Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw),
+            Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw),
+        ]
+        # One [IMG] placeholder per image passed to the processor.
+        PROMPT = "<s>[INST][IMG][IMG]Describe the images.[/INST]"
+
+        inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(torch_device, torch.float16)
+        generate_ids = model.generate(**inputs, max_new_tokens=50)
+        # Only the first (and only) sequence of the batch is decoded and compared.
+        output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
+        EXPECTED_GENERATION = "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador"  # fmt: skip
+        self.assertEqual(output, EXPECTED_GENERATION)
+
+    @slow
+    @require_bitsandbytes
+    def test_pixtral_batched(self):
+        """Generate for a batch of two single-image prompts with the checkpoint loaded in 4-bit.
+
+        Both decoded sequences must equal the pinned expected strings, in order.
+        """
+        model_id = "mistral-community/pixtral-12b"
+        model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        processor = AutoProcessor.from_pretrained(model_id)
+        # The EOS id is reused as pad id before the padded call below
+        # (presumably the tokenizer ships without a pad token - TODO confirm).
+        processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
+
+        # Despite the name, these are PIL images opened from the downloaded responses, not URL strings.
+        IMG_URLS = [
+            Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw),
+            Image.open(requests.get("https://picsum.photos/id/17/150/500", stream=True).raw),
+        ]
+        # Two prompts with one [IMG] placeholder each; the images are passed as one flat list.
+        PROMPT = [
+            "<s>[INST][IMG]What breed is the dog?[/INST]",
+            "<s>[INST][IMG]What is shown in this image?[/INST]",
+        ]
+
+        inputs = processor(text=PROMPT, images=IMG_URLS, padding=True, return_tensors="pt").to(
+            torch_device, torch.float16
+        )
+        generate_ids = model.generate(**inputs, max_new_tokens=50)
+        output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+
+        EXPECTED_GENERATION = [
+            'What breed is the dog?The dog in the image is a black Labrador Retriever.',
+            'What is shown in this image?The image depicts a narrow, winding dirt path surrounded by lush greenery. The path is flanked by grass and shrubs on both sides. On the left side, there are tall trees and dense foliage, while on the right side, there'
+        ]  # fmt: skip
+        self.assertEqual(output, EXPECTED_GENERATION)
--- a/tests/models/pixtral/test_image_processing_pixtral.py
+++ b/tests/models/pixtral/test_image_processing_pixtral.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import random
 import time
 import unittest

@@ -92,49 +91,47 @@ class PixtralImageProcessingTester:
            "do_convert_rgb": self.do_convert_rgb,
        }

-    def expected_output_image_shape(self, image):
-        if isinstance(image, Image.Image):
-            width, height = image.size
-        elif isinstance(image, np.ndarray):
-            height, width = image.shape[:2]
-        elif isinstance(image, torch.Tensor):
-            height, width = image.shape[-2:]
+    def expected_output_image_shape(self, images):
+        """Return the `(batch_size, num_channels, height, width)` shape expected for `images`.
+
+        `images` is a single image or a list/tuple of images (PIL, numpy array or torch tensor).
+        Each image is scaled down so that neither side exceeds `size["longest_edge"]`, each side is
+        rounded up to a whole number of patches, and the largest height and width found across the
+        batch are reported.
+        """
+        if not isinstance(images, (list, tuple)):
+            images = [images]

-        max_height = max_width = self.size.get("longest_edge")
+        batch_size = len(images)
+        return_height, return_width = 0, 0
+        for image in images:
+            # PIL reports (width, height); arrays are read from their first two dims, tensors from their last two.
+            if isinstance(image, Image.Image):
+                width, height = image.size
+            elif isinstance(image, np.ndarray):
+                height, width = image.shape[:2]
+            elif isinstance(image, torch.Tensor):
+                height, width = image.shape[-2:]
+            # NOTE(review): any other type leaves height/width unset, or stale from the previous image.

-        ratio = max(height / max_height, width / max_width)
-        if ratio > 1:
-            height = int(np.ceil(height / ratio))
-            width = int(np.ceil(width / ratio))
+            max_height = max_width = self.size.get("longest_edge")

-        patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
-        num_height_tokens = (height - 1) // patch_height + 1
-        num_width_tokens = (width - 1) // patch_width + 1
+            # Only shrink: images already within the limit keep their size.
+            ratio = max(height / max_height, width / max_width)
+            if ratio > 1:
+                height = int(np.ceil(height / ratio))
+                width = int(np.ceil(width / ratio))

-        height = num_height_tokens * patch_height
-        width = num_width_tokens * patch_width
+            # Ceil-divide each side by the patch size to get the number of patches along it.
+            patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
+            num_height_tokens = (height - 1) // patch_height + 1
+            num_width_tokens = (width - 1) // patch_width + 1

-        return self.num_channels, height, width
+            # Keep the running maximum of the patch-aligned sizes over the batch.
+            return_height = max(num_height_tokens * patch_height, return_height)
+            return_width = max(num_width_tokens * patch_width, return_width)
+
+        return batch_size, self.num_channels, return_height, return_width

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+        """Build one batch of test images through the module-level `prepare_image_inputs` helper.
+
+        The tester's `batch_size`, `num_channels`, `min_resolution` and `max_resolution` are
+        forwarded together with the three flags, and the helper's result is returned unchanged
+        (presumably a flat list of images - verify against the helper).
+        """
-        # Use prepare_image_inputs to make a list of list of single images
-
-        images_list = []
-        for _ in range(self.batch_size):
-            images = []
-            for _ in range(random.randint(1, self.max_num_images_per_sample)):
-                img = prepare_image_inputs(
-                    batch_size=1,
-                    num_channels=self.num_channels,
-                    min_resolution=self.min_resolution,
-                    max_resolution=self.max_resolution,
-                    equal_resolution=equal_resolution,
-                    numpify=numpify,
-                    torchify=torchify,
-                )[0]
-                images.append(img)
-            images_list.append(images)
-        return images_list
+        images = prepare_image_inputs(
+            batch_size=self.batch_size,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            numpify=numpify,
+            torchify=torchify,
+        )
+        return images


@require_torch
@@ -173,23 +170,18 @@ class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random PIL images
            image_inputs_list = self.image_processor_tester.prepare_image_inputs()
-            for image_inputs in image_inputs_list:
-                for image in image_inputs:
-                    self.assertIsInstance(image, Image.Image)
+            for image in image_inputs_list:
+                self.assertIsInstance(image, Image.Image)

            # Test not batched input
-            encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").pixel_values
-            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(
-                image_inputs_list[0][0]
-            )
-            self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape)
+            encoded_images = image_processing(image_inputs_list[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0])
+            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

            # Test batched
-            batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values
-            for encoded_images, images in zip(batch_encoded_images, image_inputs_list):
-                for encoded_image, image in zip(encoded_images, images):
-                    expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image)
-                    self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape)
+            encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list)
+            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

    def test_call_numpy(self):
        for image_processing_class in self.image_processor_list:
@@ -197,23 +189,18 @@ class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random numpy tensors
            image_inputs_list = self.image_processor_tester.prepare_image_inputs(numpify=True)
-            for image_inputs in image_inputs_list:
-                for image in image_inputs:
-                    self.assertIsInstance(image, np.ndarray)
+            for image in image_inputs_list:
+                self.assertIsInstance(image, np.ndarray)

            # Test not batched input
-            encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").pixel_values
-            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(
-                image_inputs_list[0][0]
-            )
-            self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape)
+            encoded_images = image_processing(image_inputs_list[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0])
+            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

            # Test batched
            batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values
-            for encoded_images, images in zip(batch_encoded_images, image_inputs_list):
-                for encoded_image, image in zip(encoded_images, images):
-                    expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image)
-                    self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape)
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list)
+            self.assertEqual(tuple(batch_encoded_images.shape), expected_output_image_shape)

    def test_call_pytorch(self):
        for image_processing_class in self.image_processor_list:
@@ -221,23 +208,18 @@ class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            image_processing = image_processing_class(**self.image_processor_dict)
            # create random PyTorch tensors
            image_inputs_list = self.image_processor_tester.prepare_image_inputs(torchify=True)
-            for image_inputs in image_inputs_list:
-                for image in image_inputs:
-                    self.assertIsInstance(image, torch.Tensor)
+            for image in image_inputs_list:
+                self.assertIsInstance(image, torch.Tensor)

            # Test not batched input
-            encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").pixel_values
-            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(
-                image_inputs_list[0][0]
-            )
-            self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape)
+            encoded_images = image_processing(image_inputs_list[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0])
+            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

            # Test batched
            batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values
-            for encoded_images, images in zip(batch_encoded_images, image_inputs_list):
-                for encoded_image, image in zip(encoded_images, images):
-                    expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image)
-                    self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape)
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list)
+            self.assertEqual(tuple(batch_encoded_images.shape), expected_output_image_shape)

    @require_vision
    @require_torch
--- a/tests/models/pixtral/test_modeling_pixtral.py
+++ b/tests/models/pixtral/test_modeling_pixtral.py
@@ -74,15 +74,17 @@ class PixtralVisionModelTester:
        self.initializer_range = initializer_range
        self.scope = scope

-        # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
-        num_patches = (image_size // patch_size) ** 2
-        self.seq_length = num_patches + 1
+        # in Pixtral, the seq length equals the number of patches * batch_size because the patches are flattened
+        self.seq_length = (image_size // patch_size) ** 2 * batch_size

    def prepare_config_and_inputs(self):
+        """Return `(config, pixel_values, image_sizes)` for a batch of square test images."""
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+        # One [image_size, image_size] row per batch item.
+        image_sizes = torch.tensor(
+            [[self.image_size, self.image_size]] * self.batch_size, dtype=torch.long, device=torch_device
+        )
        config = self.get_config()

-        return config, pixel_values
+        return config, pixel_values, image_sizes

    def get_config(self):
        return PixtralVisionConfig(
@@ -127,8 +129,8 @@ class PixtralVisionModelTester:

    def prepare_config_and_inputs_for_common(self):
+        """Return the config and a dict holding the `pixel_values` and `image_sizes` inputs."""
        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
+        config, pixel_values, image_sizes = config_and_inputs
+        inputs_dict = {"pixel_values": pixel_values, "image_sizes": image_sizes}
        return config, inputs_dict


@@ -142,113 +144,17 @@ class PixtralVisionModelModelTest(ModelTesterMixin, unittest.TestCase):
    test_pruning = False
    test_head_masking = False
    test_torchscript = False
+    test_resize_embeddings = False

    def setUp(self):
+        """Create the model tester and the config tester used by the tests in this class."""
        self.model_tester = PixtralVisionModelTester(self)
        self.config_tester = ConfigTester(self, config_class=PixtralVisionConfig, has_text_modality=False)

-    @unittest.skip("model does not support input embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip("model does not support input embeds")
-    def test_inputs_embeds_matches_input_ids(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
-
-    @unittest.skip(reason="Compile not yet supported because in Pixtral models")
-    def test_sdpa_can_compile_dynamic(self):
-        pass
-
-    @unittest.skip(reason="Compile not yet supported because in Pixtral models")
-    def test_sdpa_can_dispatch_on_flash(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_attention_outputs(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_cpu_offload(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_batching_equivalence(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_disk_offload_bin(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_retain_grad_hidden_states_attentions(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_multi_gpu_data_parallel_forward(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_model_parallelism(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_model_outputs_equivalence(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_save_load(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
    def test_model_get_set_embeddings(self):
+        """Check the embedding accessors of every model class.
+
+        The input embeddings must be a `torch.nn.Module`; the output embeddings must be either
+        `None` or a `torch.nn.Linear`.
+        """
-        pass
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

-    @unittest.skip(reason="Not supported yet")
-    def test_resize_tokens_embeddings(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_model_main_input_name(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_initialization(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_hidden_states_output(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_gradient_checkpointing_backward_compatibility(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_feed_forward_chunking(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_disk_offload_safetensors(self):
-        pass
-
-    @unittest.skip(reason="Not supported yet")
-    def test_determinism(self):
-        pass
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Module))
+            x = model.get_output_embeddings()
+            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
--- a/tests/models/pixtral/test_processor_pixtral.py
+++ b/tests/models/pixtral/test_processor_pixtral.py
@@ -14,7 +14,6 @@
 import shutil
 import tempfile
 import unittest
-from typing import Optional

 import requests
 import torch
@@ -28,7 +27,7 @@ from ...test_processing_common import ProcessorTesterMixin
 if is_vision_available():
    from PIL import Image

-    from transformers import AutoTokenizer, PixtralImageProcessor, PixtralProcessor
+    from transformers import PixtralProcessor


@require_vision
@@ -46,20 +45,15 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):

    def setUp(self):
+        """Load the processor from the checkpoint and save it into a fresh temporary directory."""
        self.tmpdirname = tempfile.mkdtemp()
-
-        # FIXME - just load the processor directly from the checkpoint
-        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/pixtral-12b")
-        image_processor = PixtralImageProcessor()
-        processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = PixtralProcessor.from_pretrained("mistral-community/pixtral-12b")
        processor.save_pretrained(self.tmpdirname)

    def tearDown(self):
+        """Delete the temporary directory created in `setUp`."""
        shutil.rmtree(self.tmpdirname)

-    @unittest.skip("No chat template was set for this model (yet)")
    def test_chat_template(self):
        processor = self.processor_class.from_pretrained(self.tmpdirname)
-        expected_prompt = "USER: [IMG]\nWhat is shown in this image? ASSISTANT:"
+        expected_prompt = "<s>[INST][IMG]What is shown in this image?[/INST]"

        messages = [
            {
@@ -73,13 +67,12 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        self.assertEqual(expected_prompt, formatted_prompt)

-    @unittest.skip("No chat template was set for this model (yet)")
    def test_image_token_filling(self):
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        # Important to check with non square image
        image = torch.randint(0, 2, (3, 500, 316))
-        expected_image_tokens = 1526
-        image_token_index = 32000
+        expected_image_tokens = 640
+        image_token_index = 10

        messages = [
            {
@@ -111,11 +104,8 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertIn("input_ids", inputs_image)
        self.assertTrue(len(inputs_image["input_ids"]) == 1)
        self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
-        self.assertIsInstance(inputs_image["pixel_values"], list)
-        self.assertTrue(len(inputs_image["pixel_values"]) == 1)
-        self.assertIsInstance(inputs_image["pixel_values"][0], list)
-        self.assertTrue(len(inputs_image["pixel_values"][0]) == 1)
-        self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor)
+        self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 32, 32]))

        # fmt: off
        input_ids = inputs_image["input_ids"]
@@ -131,11 +121,8 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertIn("input_ids", inputs_url)
        self.assertTrue(len(inputs_url["input_ids"]) == 1)
        self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
-        self.assertIsInstance(inputs_url["pixel_values"], list)
-        self.assertTrue(len(inputs_url["pixel_values"]) == 1)
-        self.assertIsInstance(inputs_url["pixel_values"][0], list)
-        self.assertTrue(len(inputs_url["pixel_values"][0]) == 1)
-        self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor)
+        self.assertIsInstance(inputs_url["pixel_values"], torch.Tensor)
+        self.assertTrue(inputs_url["pixel_values"].shape == torch.Size([1, 3, 32, 32]))

        # fmt: off
        input_ids = inputs_url["input_ids"]
@@ -146,6 +133,28 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        )
        # fmt: on

+        # Test passing inputs as a single list
+        inputs_image = processor(text=prompt_string, images=[self.image_0], return_tensors="pt")
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 32, 32]))
+
+        # fmt: off
+        self.assertEqual(
+            inputs_image["input_ids"][0].tolist(),
+            [21510,  1058,  1032,    10,    10,    12,    10,    10,    13,  1010, 7493,  1681,  1278,  4701,  1307,  1278,  3937,  1063,  1349,  4290, 16002, 41150,  1058]
+        )
+        # fmt: on
+
+        # Test as nested single list
+        inputs_image = processor(text=prompt_string, images=[[self.image_0]], return_tensors="pt")
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([1, 3, 32, 32]))
+
+        # fmt: off
+        self.assertEqual(
+            inputs_image["input_ids"][0].tolist(),
+            [21510,  1058,  1032,    10,    10,    12,    10,    10,    13,  1010, 7493,  1681,  1278,  4701,  1307,  1278,  3937,  1063,  1349,  4290, 16002, 41150,  1058]
+        )
+        # fmt: on
+
    def test_processor_with_multiple_images_single_list(self):
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        prompt_string = "USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:"
@@ -159,11 +168,8 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertIn("input_ids", inputs_image)
        self.assertTrue(len(inputs_image["input_ids"]) == 1)
        self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
-        self.assertIsInstance(inputs_image["pixel_values"], list)
-        self.assertTrue(len(inputs_image["pixel_values"]) == 1)
-        self.assertIsInstance(inputs_image["pixel_values"][0], list)
-        self.assertTrue(len(inputs_image["pixel_values"][0]) == 2)
-        self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor)
+        self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([2, 3, 32, 32]))

        # fmt: off
        input_ids = inputs_image["input_ids"]
@@ -179,11 +185,9 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertIn("input_ids", inputs_url)
        self.assertTrue(len(inputs_url["input_ids"]) == 1)
        self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
-        self.assertIsInstance(inputs_url["pixel_values"], list)
-        self.assertTrue(len(inputs_url["pixel_values"]) == 1)
-        self.assertIsInstance(inputs_url["pixel_values"][0], list)
-        self.assertTrue(len(inputs_url["pixel_values"][0]) == 2)
-        self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor)
+        self.assertIsInstance(inputs_url["pixel_values"], torch.Tensor)
+        self.assertTrue(inputs_url["pixel_values"].shape == torch.Size([2, 3, 32, 32]))
+
        # fmt: off
        input_ids = inputs_url["input_ids"]
        self.assertEqual(
@@ -193,6 +197,17 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        )
        # fmt: on

+        # Test passing in as a nested list
+        inputs_url = processor(text=prompt_string, images=[[self.image_0, self.image_1]], return_tensors="pt")
+        self.assertTrue(inputs_url["pixel_values"].shape == torch.Size([2, 3, 32, 32]))
+
+        # fmt: off
+        self.assertEqual(
+            inputs_url["input_ids"][0].tolist(),
+            [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+        )
+        # fmt: on
+
    def test_processor_with_multiple_images_multiple_lists(self):
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        prompt_string = [
@@ -211,11 +226,8 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertIn("input_ids", inputs_image)
        self.assertTrue(len(inputs_image["input_ids"]) == 2)
        self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
-        self.assertIsInstance(inputs_image["pixel_values"], list)
-        self.assertTrue(len(inputs_image["pixel_values"]) == 2)
-        self.assertIsInstance(inputs_image["pixel_values"][0], list)
-        self.assertTrue(len(inputs_image["pixel_values"][0]) == 2)
-        self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor)
+        self.assertIsInstance(inputs_image["pixel_values"], torch.Tensor)
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 32, 32]))

        # fmt: off
        input_ids = inputs_image["input_ids"]
@@ -231,11 +243,8 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertIn("input_ids", inputs_url)
        self.assertTrue(len(inputs_url["input_ids"]) == 2)
        self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
-        self.assertIsInstance(inputs_url["pixel_values"], list)
-        self.assertTrue(len(inputs_url["pixel_values"]) == 2)
-        self.assertIsInstance(inputs_url["pixel_values"][0], list)
-        self.assertTrue(len(inputs_url["pixel_values"][0]) == 2)
-        self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor)
+        self.assertIsInstance(inputs_url["pixel_values"], torch.Tensor)
+        self.assertTrue(inputs_url["pixel_values"].shape == torch.Size([3, 3, 32, 32]))

        # fmt: off
        input_ids = inputs_url["input_ids"]
@@ -246,6 +255,19 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        )
        # fmt: on

+        # Test passing as a single flat list
+        inputs_image = processor(
+            text=prompt_string, images=[self.image_0, self.image_1, self.image_2], return_tensors="pt", padding=True
+        )
+        self.assertTrue(inputs_image["pixel_values"].shape == torch.Size([3, 3, 32, 32]))
+
+        # fmt: off
+        self.assertEqual(
+            inputs_image["input_ids"][0].tolist(),
+            [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+        )
+        # fmt: on
+
    def test_processor_returns_full_length_batches(self):
        # to avoid https://github.com/huggingface/transformers/issues/34204
        processor = self.processor_class.from_pretrained(self.tmpdirname)
@@ -264,13 +286,3 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertIn("input_ids", inputs_image)
        self.assertTrue(len(inputs_image["input_ids"]) == 5)
        self.assertTrue(len(inputs_image["pixel_values"]) == 5)
-
-    # Override as PixtralProcessor needs nested images to work properly with batched inputs
-    @require_vision
-    def prepare_image_inputs(self, batch_size: Optional[int] = None):
-        """This function prepares a list of PIL images for testing"""
-        if batch_size is None:
-            return super().prepare_image_inputs()
-        if batch_size < 1:
-            raise ValueError("batch_size must be greater than 0")
-        return [[super().prepare_image_inputs()]] * batch_size