fix multi-image case for llava-onevision (#38084)
* _get_padding_size module * do not patchify images when processing multi image * modify llava onevision image processor fast * tensor to list of tensors * backward compat * reuse pad_to_square in llava & some clarification * add to doc * fix: consider no image cases (text only or video) * add integration test * style & repo_consistency
This commit is contained in:
@@ -202,7 +202,7 @@ class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestC
|
||||
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
||||
|
||||
# Test batched as a nested list of images, where each sublist is one batch
|
||||
image_inputs_nested = [image_inputs[:3], image_inputs[3:]]
|
||||
image_inputs_nested = [[image_input] for image_input in image_inputs]
|
||||
encoded_images_nested = image_processing(image_inputs_nested, return_tensors="pt").pixel_values
|
||||
expected_output_image_shape = (7, 1522, 3, 20, 20)
|
||||
self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape)
|
||||
@@ -210,6 +210,39 @@ class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestC
|
||||
# Image processor should return same pixel values, independently of input format
|
||||
self.assertTrue((encoded_images_nested == encoded_images).all())
|
||||
|
||||
def test_multi_images(self):
    """Check the output shape for a nested batch mixing a 1-image sample and a 7-image sample.

    The processor is configured with a square ``size`` of ``length`` pixels. The
    first sample holds one image of side ``length * scale_single`` and the second
    holds seven images of side ``length * scale_multi``. The expected result is
    8 entries, each with ``scale_single**2 + 1`` patches of ``length x length``.
    """
    length = 384
    scale_single, scale_multi = 2, 3

    processor_kwargs = self.image_processor_tester.prepare_image_processor_dict()
    processor_kwargs["size"] = {"height": length, "width": length}  # patch size

    # (number of images, image side length) for each sample, in generation order
    sample_specs = (
        (1, length * scale_single),
        (7, length * scale_multi),
    )

    # Only single image should be patchified
    patches_per_image = scale_single**2 + 1  # +1 for base image patch
    expected_shape = (8, patches_per_image, 3, length, length)

    for processor_cls in self.image_processor_list:
        processor = processor_cls(**processor_kwargs)

        # Nested input: one inner list of images per sample
        nested_inputs = [
            prepare_image_inputs(
                batch_size=num_images,
                min_resolution=0,  # not used
                max_resolution=side,
                num_channels=3,
                equal_resolution=True,
            )
            for num_images, side in sample_specs
        ]

        pixel_values = processor(nested_inputs, return_tensors="pt").pixel_values
        self.assertEqual(tuple(pixel_values.shape), expected_shape)
|
||||
|
||||
@unittest.skip(
|
||||
reason="LlavaOnevisionImageProcessorFast doesn't compile (infinitely) when using class transforms"
|
||||
) # FIXME yoni
|
||||
|
||||
@@ -460,6 +460,33 @@ class LlavaOnevisionForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
@require_bitsandbytes
def test_small_model_integration_test_multi_image_nested(self):
    """Generate from a two-image prompt whose images are passed as a nested list.

    Related to (#34585). Both images belong to a single sample, so they are
    wrapped in one inner list before being handed to the processor.
    """
    checkpoint = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
    model = LlavaOnevisionForConditionalGeneration.from_pretrained(
        checkpoint,
        torch_dtype="float16",
        device_map=torch_device,
    )

    stop_sign_url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    response = requests.get(stop_sign_url, stream=True)
    stop_sign_image = Image.open(response.raw)

    prompt = (
        "user\n<image><image>\nWhat is the difference between these images?<|im_end|>\n<|im_start|>assistant\n"
    )

    # One sample containing both images
    batch = self.processor(
        text=prompt,
        images=[[self.image, stop_sign_image]],
        return_tensors="pt",
    )
    batch = batch.to(torch_device, torch.float16)

    # verify generation
    generated_ids = model.generate(**batch, max_new_tokens=40)
    decoded_text = self.processor.decode(generated_ids[0], skip_special_tokens=True)
    EXPECTED_DECODED_TEXT = "user\n\nWhat is the difference between these images?\nassistant\nThe first image is a radar chart showing the performance of different models in a specific task, while the second image is a street scene with a stop sign in the foreground."  # fmt: skip

    self.assertEqual(decoded_text, EXPECTED_DECODED_TEXT)
|
||||
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_multi_video(self):
|
||||
|
||||
@@ -233,7 +233,7 @@ class ImageProcessingTestMixin:
|
||||
avg_time = sum(sorted(all_times[:3])) / 3.0
|
||||
return avg_time
|
||||
|
||||
dummy_images = torch.randint(0, 255, (4, 3, 224, 224), dtype=torch.uint8)
|
||||
dummy_images = [torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8) for _ in range(4)]
|
||||
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
|
||||
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user