[llava] one pixel is missing from padding when length is odd (#37819)

* [fix] one pixel should be added when length is odd

* [fix] add vision_aspect_ratio args & typo

* [fix] style

* [fix] do not fix fast file directly

* [fix] convert using modular

* remove duplicate codes

* match unpad logic with pad logic

* test odd-sized images for llava & aria

* test unpad odd-sized padding for llava family

* fix style

* add kwarg to onvision modular

* move vision_aspect_ratio from image_processor to processor
(llava_onevision)
This commit is contained in:
youngrok cha
2025-05-06 20:11:26 +09:00
committed by GitHub
parent 9981214d32
commit acded47fe7
17 changed files with 234 additions and 133 deletions

View File

@@ -17,7 +17,7 @@ import unittest
import numpy as np
from transformers.image_utils import PILImageResampling
from transformers.image_utils import ChannelDimension, PILImageResampling
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
@@ -264,3 +264,41 @@ class AriaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
tuple(encoded_images.shape),
(self.image_processor_tester.batch_size, *expected_output_image_shape),
)
def test_pad_for_patching(self):
for image_processing_class in self.image_processor_list:
if image_processing_class == self.fast_image_processing_class:
numpify = False
torchify = True
input_data_format = image_processing_class.data_format
else:
numpify = True
torchify = False
input_data_format = ChannelDimension.LAST
image_processing = image_processing_class(**self.image_processor_dict)
# Create odd-sized images
image_input = self.image_processor_tester.prepare_image_inputs(
batch_size=1,
max_resolution=400,
num_images=1,
equal_resolution=True,
numpify=numpify,
torchify=torchify,
)[0][0]
self.assertIn(image_input.shape, [(3, 400, 400), (400, 400, 3)])
# Test odd-width
image_shape = (400, 601)
encoded_images = image_processing._pad_for_patching(image_input, image_shape, input_data_format)
encoded_image_shape = (
encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:]
)
self.assertEqual(encoded_image_shape, image_shape)
# Test odd-height
image_shape = (503, 400)
encoded_images = image_processing._pad_for_patching(image_input, image_shape, input_data_format)
encoded_image_shape = (
encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:]
)
self.assertEqual(encoded_image_shape, image_shape)