fix multi-image case for llava-onevision (#38084)

* _get_padding_size module

* do not patchify images when processing multi image

* modify llava onevision image processor fast

* tensor to list of tensors

* backward compat

* reuse pad_to_square in llava & some clarification

* add to doc

* fix: consider no image cases (text only or video)

* add integration test

* style & repo_consistency
This commit is contained in:
youngrok cha
2025-05-21 18:50:46 +09:00
committed by GitHub
parent a21f11fca2
commit 101b3fa4ea
13 changed files with 620 additions and 93 deletions

View File

@@ -202,7 +202,7 @@ class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestC
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
# Test batched as a nested list of images, where each sublist is one batch
image_inputs_nested = [image_inputs[:3], image_inputs[3:]]
image_inputs_nested = [[image_input] for image_input in image_inputs]
encoded_images_nested = image_processing(image_inputs_nested, return_tensors="pt").pixel_values
expected_output_image_shape = (7, 1522, 3, 20, 20)
self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape)
@@ -210,6 +210,39 @@ class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestC
# Image processor should return same pixel values, independently of input format
self.assertTrue((encoded_images_nested == encoded_images).all())
def test_multi_images(self):
    """Check the output shape for a nested batch mixing single- and multi-image samples.

    The input holds two samples: one with a single square image of side
    ``2 * 384`` and one with seven square images of side ``3 * 384``. The
    processor is configured with a 384x384 ``size``. Only the single-image
    sample is expected to be split into patches, so the patch dimension of
    the output is ``2**2 + 1`` (the extra one being the base image patch)
    for all 8 images.
    """
    patch_side = 384
    single_factor, multi_factor = 2, 3

    processor_kwargs = self.image_processor_tester.prepare_image_processor_dict()
    processor_kwargs["size"] = {"height": patch_side, "width": patch_side}  # patch size

    # (number of images, square side) for each sample, in generation order
    sample_specs = (
        (1, patch_side * single_factor),
        (7, patch_side * multi_factor),
    )

    # Only the single image should be patchified; +1 for the base image patch
    num_patches = single_factor**2 + 1
    expected_shape = (8, num_patches, 3, patch_side, patch_side)

    for processor_cls in self.image_processor_list:
        processor = processor_cls(**processor_kwargs)

        # One sublist of images per sample
        nested_images = [
            prepare_image_inputs(
                batch_size=num_images,
                min_resolution=0,  # not used
                max_resolution=side,
                num_channels=3,
                equal_resolution=True,
            )
            for num_images, side in sample_specs
        ]

        pixel_values = processor(nested_images, return_tensors="pt").pixel_values
        self.assertEqual(tuple(pixel_values.shape), expected_shape)
@unittest.skip(
reason="LlavaOnevisionImageProcessorFast doesn't compile (infinitely) when using class transforms"
) # FIXME yoni

View File

@@ -460,6 +460,33 @@ class LlavaOnevisionForConditionalGenerationIntegrationTest(unittest.TestCase):
EXPECTED_DECODED_TEXT,
)
@slow
@require_bitsandbytes
def test_small_model_integration_test_multi_image_nested(self):
    """Generate from a prompt with two images passed as a nested list.

    The two images (``self.image`` and a downloaded stop-sign photo) are
    wrapped in a single sublist, i.e. one sample containing two images, and
    the decoded generation is compared against a fixed reference string.
    """
    # related to (#34585)
    model = LlavaOnevisionForConditionalGeneration.from_pretrained(
        "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
        torch_dtype="float16",
        device_map=torch_device,
    )

    url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    # Bound the download so a stalled server cannot hang the test run, and
    # fail on an HTTP error instead of handing an error page to PIL.
    response = requests.get(url, stream=True, timeout=60)
    response.raise_for_status()
    image = Image.open(response.raw)

    prompt = (
        "user\n<image><image>\nWhat is the difference between these images?<|im_end|>\n<|im_start|>assistant\n"
    )
    # A single sample holding both images
    images_nested = [[self.image, image]]
    inputs = self.processor(text=prompt, images=images_nested, return_tensors="pt").to(torch_device, torch.float16)

    # verify generation
    output = model.generate(**inputs, max_new_tokens=40)
    EXPECTED_DECODED_TEXT = "user\n\nWhat is the difference between these images?\nassistant\nThe first image is a radar chart showing the performance of different models in a specific task, while the second image is a street scene with a stop sign in the foreground."  # fmt: skip
    self.assertEqual(
        self.processor.decode(output[0], skip_special_tokens=True),
        EXPECTED_DECODED_TEXT,
    )
@slow
@require_bitsandbytes
def test_small_model_integration_test_multi_video(self):

View File

@@ -233,7 +233,7 @@ class ImageProcessingTestMixin:
avg_time = sum(sorted(all_times[:3])) / 3.0
return avg_time
dummy_images = torch.randint(0, 255, (4, 3, 224, 224), dtype=torch.uint8)
dummy_images = [torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8) for _ in range(4)]
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)