[llava] one pixel is missing from padding when length is odd (#37819)

* [fix] one pixel should be added when length is odd

* [fix] add vision_aspect_ratio args & typo

* [fix] style

* [fix] do not fix fast file directly

* [fix] convert using modular

* remove duplicate codes

* match unpad logic with pad logic

* test odd-sized images for llava & aria

* test unpad odd-sized padding for llava family

* fix style

* add kwarg to onvision modular

* move vision_aspect_ratio from image_processor to processor
(llava_onevision)
This commit is contained in:
youngrok cha
2025-05-06 20:11:26 +09:00
committed by GitHub
parent 9981214d32
commit acded47fe7
17 changed files with 234 additions and 133 deletions

View File

@@ -16,7 +16,7 @@ import unittest
import numpy as np
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ChannelDimension
from transformers.models.llava_next.image_processing_llava_next import select_best_resolution
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
@@ -230,3 +230,38 @@ class LlavaNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
# Image processor should return same pixel values, independently of ipnut format
self.assertTrue((encoded_images_nested == encoded_images).all())
def test_pad_for_patching(self):
for image_processing_class in self.image_processor_list:
if image_processing_class == self.fast_image_processing_class:
numpify = False
torchify = True
input_data_format = image_processing_class.data_format
else:
numpify = True
torchify = False
input_data_format = ChannelDimension.LAST
image_processing = image_processing_class(**self.image_processor_dict)
# Create odd-sized images
image_input = self.image_processor_tester.prepare_image_inputs(
equal_resolution=True,
numpify=numpify,
torchify=torchify,
)[0]
self.assertIn(image_input.shape, [(3, 400, 400), (400, 400, 3)])
# Test odd-width
image_shape = (400, 601)
encoded_images = image_processing._pad_for_patching(image_input, image_shape, input_data_format)
encoded_image_shape = (
encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:]
)
self.assertEqual(encoded_image_shape, image_shape)
# Test odd-height
image_shape = (503, 400)
encoded_images = image_processing._pad_for_patching(image_input, image_shape, input_data_format)
encoded_image_shape = (
encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:]
)
self.assertEqual(encoded_image_shape, image_shape)

View File

@@ -48,7 +48,7 @@ from ...test_modeling_common import (
if is_torch_available():
import torch
from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches
from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches, unpad_image
if is_vision_available():
@@ -288,6 +288,19 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
image_sizes = torch.cat([image_sizes, image_sizes], dim=0)
_ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
def test_unpad_image(self):
original_size = (400, 400)
# Test case width is padded
pixel_values = floats_tensor([3, 400, 601])
unpadded_tensor = unpad_image(pixel_values, original_size)
self.assertEqual(unpadded_tensor.shape[1:], original_size)
# Test case height is padded
pixel_values = floats_tensor([3, 503, 400])
unpadded_tensor = unpad_image(pixel_values, original_size)
self.assertEqual(unpadded_tensor.shape[1:], original_size)
@parameterized.expand(
[
(-1,),