fix multi-image case for llava-onevision (#38084)

* _get_padding_size module * do not patchify images when processing multi image * modify llava onevision image processor fast * tensor to list of tensors * backward compat * reuse pad_to_square in llava & some clarification * add to doc * fix: consider no image cases (text only or video) * add integration test * style & repo_consistency
2025-05-21 18:50:46 +09:00
parent a21f11fca2
commit 101b3fa4ea
13 changed files with 620 additions and 93 deletions
--- a/tests/models/llava_onevision/test_image_processing_llava_onevision.py
+++ b/tests/models/llava_onevision/test_image_processing_llava_onevision.py
@@ -202,7 +202,7 @@ class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestC
            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

            # Test batched as a nested list of images, where each sublist is one batch
-            image_inputs_nested = [image_inputs[:3], image_inputs[3:]]
+            image_inputs_nested = [[image_input] for image_input in image_inputs]
            encoded_images_nested = image_processing(image_inputs_nested, return_tensors="pt").pixel_values
            expected_output_image_shape = (7, 1522, 3, 20, 20)
            self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape)
@@ -210,6 +210,39 @@ class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestC
            # Image processor should return same pixel values, independently of input format
            self.assertTrue((encoded_images_nested == encoded_images).all())

+    def test_multi_images(self):
+        length = 384
+        scale_single, scale_multi = 2, 3
+        image_processor_dict = self.image_processor_tester.prepare_image_processor_dict()
+        image_processor_dict["size"] = {"height": length, "width": length}  # side length of each output patch
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**image_processor_dict)
+
+            # Nested input: the first sublist holds a single image, the second sublist holds seven images
+            len_image_1 = length * scale_single
+            image_inputs_1 = prepare_image_inputs(
+                batch_size=1,
+                min_resolution=0,  # not used
+                max_resolution=len_image_1,
+                num_channels=3,
+                equal_resolution=True,
+            )
+            len_image_2 = length * scale_multi
+            image_inputs_2 = prepare_image_inputs(
+                batch_size=7,
+                min_resolution=0,  # not used
+                max_resolution=len_image_2,
+                num_channels=3,
+                equal_resolution=True,
+            )
+            image_inputs = [image_inputs_1, image_inputs_2]
+
+            # Only the image from the single-image sublist should be patchified
+            expected_num_patches = scale_single**2 + 1  # +1 for base image patch
+            expected_output_image_shape = (8, expected_num_patches, 3, length, length)
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
+            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
+
    @unittest.skip(
        reason="LlavaOnevisionImageProcessorFast doesn't compile (infinitely) when using class transforms"
    )  # FIXME yoni
--- a/tests/models/llava_onevision/test_modeling_llava_onevision.py
+++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py
@@ -460,6 +460,33 @@ class LlavaOnevisionForConditionalGenerationIntegrationTest(unittest.TestCase):
            EXPECTED_DECODED_TEXT,
        )

+    @slow
+    @require_bitsandbytes
+    def test_small_model_integration_test_multi_image_nested(self):
+        # Two images passed as a nested list ([[image, image]]) for a single prompt; related to (#34585)
+        model = LlavaOnevisionForConditionalGeneration.from_pretrained(
+            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+            torch_dtype="float16",
+            device_map=torch_device,
+        )
+
+        url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        image = Image.open(requests.get(url, stream=True).raw)
+        prompt = (
+            "user\n<image><image>\nWhat is the difference between these images?<|im_end|>\n<|im_start|>assistant\n"
+        )
+        images_nested = [[self.image, image]]
+        inputs = self.processor(text=prompt, images=images_nested, return_tensors="pt").to(torch_device, torch.float16)
+
+        # verify that the decoded generation matches the expected text exactly
+        output = model.generate(**inputs, max_new_tokens=40)
+        EXPECTED_DECODED_TEXT = "user\n\nWhat is the difference between these images?\nassistant\nThe first image is a radar chart showing the performance of different models in a specific task, while the second image is a street scene with a stop sign in the foreground."  # fmt: skip
+
+        self.assertEqual(
+            self.processor.decode(output[0], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
+        )
+
    @slow
    @require_bitsandbytes
    def test_small_model_integration_test_multi_video(self):
--- a/tests/test_image_processing_common.py
+++ b/tests/test_image_processing_common.py
@@ -233,7 +233,7 @@ class ImageProcessingTestMixin:
            avg_time = sum(sorted(all_times[:3])) / 3.0
            return avg_time

-        dummy_images = torch.randint(0, 255, (4, 3, 224, 224), dtype=torch.uint8)
+        dummy_images = [torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8) for _ in range(4)]
        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)