From 97d2f9d8aeea89d81ebf883640aa0d7dc561a46a Mon Sep 17 00:00:00 2001
From: Raushan Turganbay <raushan@huggingface.co>
Date: Fri, 21 Mar 2025 09:35:37 +0100
Subject: [PATCH] Mllama: raise better error (#35934)

* fix mllama

* update test

* fix test
---
 .../models/mllama/processing_mllama.py        |  8 +++--
 tests/models/mllama/test_processor_mllama.py  | 34 +++++++++++++++++++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py
index 8e845ffd3a..d26d93bc3c 100644
--- a/src/transformers/models/mllama/processing_mllama.py
+++ b/src/transformers/models/mllama/processing_mllama.py
@@ -301,12 +301,16 @@ class MllamaProcessor(ProcessorMixin):
                 raise ValueError(
                     "If a batch of text is provided, there should be either no images or at least one image per sample"
                 )
-            if sum(n_images_in_images) != sum(n_images_in_text):
+            if sum(n_images_in_text) > 0 and n_images_in_images != n_images_in_text:  # per-sample counts, not totals
                 if images is None:
                     raise ValueError("No image were provided, but there are image tokens in the prompt")
                 else:
+                    add_message = ""
+                    if sum(n_images_in_images) == sum(n_images_in_text):  # totals agree, only the grouping differs
+                        add_message = "Make sure to pass your images as a nested list, where each sub-list holds images per batch"
                     raise ValueError(
-                        f"The number of image token ({sum(n_images_in_text)}) should be the same as in the number of provided images ({sum(n_images_in_images)})"
+                        f"The number of image tokens in each text ({n_images_in_text}) should be the same as the "
+                        f"number of provided images per batch ({n_images_in_images}). {add_message}"
                     )
 
         if images is not None:
diff --git a/tests/models/mllama/test_processor_mllama.py b/tests/models/mllama/test_processor_mllama.py
index 6d5db2f677..bbc1d3dfc8 100644
--- a/tests/models/mllama/test_processor_mllama.py
+++ b/tests/models/mllama/test_processor_mllama.py
@@ -327,6 +327,11 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         with self.assertRaises(ValueError):
             processor(text=text, images=None, padding=True)
 
+        # A flat (un-nested) image list is expected to raise, see https://github.com/huggingface/transformers/pull/35934
+        images = [self.image1, self.image2]
+        with self.assertRaises(ValueError):
+            processor(text=text, images=images, padding=True)
+
     # Override as MllamaProcessor needs image tokens in prompts
     def prepare_text_inputs(self, batch_size: Optional[int] = None):
         if batch_size is None:
@@ -340,3 +345,32 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
         return ["lower newer <|image|>", "<|image|> upper older longer string"] + ["<|image|> lower newer"] * (
             batch_size - 2
         )
+
+    def test_unstructured_kwargs_batched(self):
+        # Overridden because Mllama expects images in nested format. Given a flat list of 2 images it
+        # cannot infer the correct nesting, so the images are nested explicitly below.
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        processor_components = self.prepare_components()
+        processor_kwargs = self.prepare_processor_dict()
+        processor = self.processor_class(**processor_components, **processor_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = self.prepare_text_inputs(batch_size=2)
+        image_input = self.prepare_image_inputs(batch_size=2)
+        image_input = [[image_input[0]], [image_input[1]]]  # one sub-list of images per text sample
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            do_rescale=True,
+            rescale_factor=-1,
+            padding="longest",
+            max_length=76,
+        )
+
+        self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0)
+        self.assertTrue(
+            len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1])
+            and len(inputs[self.text_input_name][1]) < 76
+        )