[llava] one pixel is missing from padding when length is odd (#37819)

* [fix] one pixel should be added when length is odd * [fix] add vision_aspect_ratio args & typo * [fix] style * [fix] do not fix fast file directly * [fix] convert using modular * remove duplicate codes * match unpad logic with pad logic * test odd-sized images for llava & aria * test unpad odd-sized padding for llava family * fix style * add kwarg to onvision modular * move vision_aspect_ratio from image_processor to processor (llava_onevision)
2025-05-06 20:11:26 +09:00
parent 9981214d32
commit acded47fe7
17 changed files with 234 additions and 133 deletions
--- a/tests/models/aria/test_image_processing_aria.py
+++ b/tests/models/aria/test_image_processing_aria.py
@@ -17,7 +17,7 @@ import unittest

 import numpy as np

-from transformers.image_utils import PILImageResampling
+from transformers.image_utils import ChannelDimension, PILImageResampling
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available

@@ -264,3 +264,41 @@ class AriaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
                tuple(encoded_images.shape),
                (self.image_processor_tester.batch_size, *expected_output_image_shape),
            )
+
+    def test_pad_for_patching(self):
+        for image_processing_class in self.image_processor_list:
+            if image_processing_class == self.fast_image_processing_class:
+                numpify = False
+                torchify = True
+                input_data_format = image_processing_class.data_format
+            else:
+                numpify = True
+                torchify = False
+                input_data_format = ChannelDimension.LAST
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # Create odd-sized images
+            image_input = self.image_processor_tester.prepare_image_inputs(
+                batch_size=1,
+                max_resolution=400,
+                num_images=1,
+                equal_resolution=True,
+                numpify=numpify,
+                torchify=torchify,
+            )[0][0]
+            self.assertIn(image_input.shape, [(3, 400, 400), (400, 400, 3)])
+
+            # Test odd-width
+            image_shape = (400, 601)
+            encoded_images = image_processing._pad_for_patching(image_input, image_shape, input_data_format)
+            encoded_image_shape = (
+                encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:]
+            )
+            self.assertEqual(encoded_image_shape, image_shape)
+
+            # Test odd-height
+            image_shape = (503, 400)
+            encoded_images = image_processing._pad_for_patching(image_input, image_shape, input_data_format)
+            encoded_image_shape = (
+                encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:]
+            )
+            self.assertEqual(encoded_image_shape, image_shape)