From 97d2f9d8aeea89d81ebf883640aa0d7dc561a46a Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Fri, 21 Mar 2025 09:35:37 +0100 Subject: [PATCH] Mllama: raise better error (#35934) * fix mllama * update test * fix test --- .../models/mllama/processing_mllama.py | 8 +++-- tests/models/mllama/test_processor_mllama.py | 34 +++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index 8e845ffd3a..d26d93bc3c 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -301,12 +301,16 @@ class MllamaProcessor(ProcessorMixin): raise ValueError( "If a batch of text is provided, there should be either no images or at least one image per sample" ) - if sum(n_images_in_images) != sum(n_images_in_text): + if sum(n_images_in_text) > 0 and n_images_in_images != n_images_in_text: if images is None: raise ValueError("No image were provided, but there are image tokens in the prompt") else: + add_message = "" + if sum(n_images_in_images) == sum(n_images_in_text): + add_message = "Make sure to pass your images as a nested list, where each sub-list holds images per batch" raise ValueError( - f"The number of image token ({sum(n_images_in_text)}) should be the same as in the number of provided images ({sum(n_images_in_images)})" + f"The number of image tokens in each text ({n_images_in_text}) should be the same as the " + f"number of provided images per batch ({n_images_in_images}). 
{add_message}" ) if images is not None: diff --git a/tests/models/mllama/test_processor_mllama.py b/tests/models/mllama/test_processor_mllama.py index 6d5db2f677..bbc1d3dfc8 100644 --- a/tests/models/mllama/test_processor_mllama.py +++ b/tests/models/mllama/test_processor_mllama.py @@ -327,6 +327,11 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase): with self.assertRaises(ValueError): processor(text=text, images=None, padding=True) + # see https://github.com/huggingface/transformers/pull/35934 + images = [self.image1, self.image2] + with self.assertRaises(ValueError): + processor(text=text, images=None, padding=True) + # Override as MllamaProcessor needs image tokens in prompts def prepare_text_inputs(self, batch_size: Optional[int] = None): if batch_size is None: @@ -340,3 +345,32 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase): return ["lower newer <|image|>", "<|image|> upper older longer string"] + ["<|image|> lower newer"] * ( batch_size - 2 ) + + def test_unstructured_kwargs_batched(self): + # Overridden because Mllama expects images in nested format. 
For 2 images it can't infer + # the correct nesting, so we better throw an error + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_kwargs = self.prepare_processor_dict() + processor = self.processor_class(**processor_components, **processor_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs(batch_size=2) + image_input = self.prepare_image_inputs(batch_size=2) + image_input = [[image_input[0]], [image_input[1]]] + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + do_rescale=True, + rescale_factor=-1, + padding="longest", + max_length=76, + ) + + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) + self.assertTrue( + len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1]) + and len(inputs[self.text_input_name][1]) < 76 + )