[llava] one pixel is missing from padding when length is odd (#37819)
* [fix] one pixel should be added when length is odd * [fix] add vision_aspect_ratio args & typo * [fix] style * [fix] do not fix fast file directly * [fix] convert using modular * remove duplicate code * match unpad logic with pad logic * test odd-sized images for llava & aria * test unpad odd-sized padding for llava family * fix style * add kwarg to onevision modular * move vision_aspect_ratio from image_processor to processor (llava_onevision)
This commit is contained in:
@@ -17,7 +17,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.image_utils import PILImageResampling
|
||||
from transformers.image_utils import ChannelDimension, PILImageResampling
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
@@ -264,3 +264,41 @@ class AriaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
tuple(encoded_images.shape),
|
||||
(self.image_processor_tester.batch_size, *expected_output_image_shape),
|
||||
)
|
||||
|
||||
def test_pad_for_patching(self):
    """Verify `_pad_for_patching` reaches target shapes that have one odd side.

    A 400x400 image is padded first to (400, 601) and then to (503, 400), so
    the number of pixels to add (201 and 103) is odd in both cases.
    """
    # (height, width) targets: odd width first, then odd height.
    target_shapes = ((400, 601), (503, 400))

    for processor_cls in self.image_processor_list:
        uses_fast = processor_cls == self.fast_image_processing_class
        # The fast class supplies its own data format and is fed torchified
        # inputs; every other class gets numpified inputs handled as channels-last.
        data_format = processor_cls.data_format if uses_fast else ChannelDimension.LAST
        processor = processor_cls(**self.image_processor_dict)

        # Single square image; the odd sizes come from the padding targets below.
        nested_inputs = self.image_processor_tester.prepare_image_inputs(
            batch_size=1,
            max_resolution=400,
            num_images=1,
            equal_resolution=True,
            numpify=not uses_fast,
            torchify=uses_fast,
        )
        image = nested_inputs[0][0]
        self.assertIn(image.shape, [(3, 400, 400), (400, 400, 3)])

        # Part of the shape holding (height, width) for the active data format.
        if data_format == ChannelDimension.LAST:
            spatial_dims = slice(None, -1)
        else:
            spatial_dims = slice(1, None)

        for target_shape in target_shapes:
            padded = processor._pad_for_patching(image, target_shape, data_format)
            self.assertEqual(padded.shape[spatial_dims], target_shape)
|
||||
|
||||
@@ -16,7 +16,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ChannelDimension
|
||||
from transformers.models.llava_next.image_processing_llava_next import select_best_resolution
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
@@ -230,3 +230,38 @@ class LlavaNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
|
||||
# Image processor should return same pixel values, independently of input format
|
||||
self.assertTrue((encoded_images_nested == encoded_images).all())
|
||||
|
||||
def test_pad_for_patching(self):
    """Verify `_pad_for_patching` reaches target shapes that have one odd side.

    The 400x400 input is padded to (400, 601) and then to (503, 400); in both
    cases the number of pixels to add is odd.
    """
    for processor_cls in self.image_processor_list:
        # Fast class: torchified inputs in the class's own data format.
        # Any other class: numpified inputs handled as channels-last.
        if processor_cls == self.fast_image_processing_class:
            input_flags = {"numpify": False, "torchify": True}
            data_format = processor_cls.data_format
        else:
            input_flags = {"numpify": True, "torchify": False}
            data_format = ChannelDimension.LAST
        processor = processor_cls(**self.image_processor_dict)

        # Take the first prepared image; it is square, the targets are odd-sized.
        prepared = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, **input_flags)
        image = prepared[0]
        self.assertIn(image.shape, [(3, 400, 400), (400, 400, 3)])

        # Odd-width target first, then odd-height target.
        for target_shape in ((400, 601), (503, 400)):
            padded = processor._pad_for_patching(image, target_shape, data_format)
            if data_format == ChannelDimension.LAST:
                padded_hw = padded.shape[:-1]
            else:
                padded_hw = padded.shape[1:]
            self.assertEqual(padded_hw, target_shape)
|
||||
|
||||
@@ -48,7 +48,7 @@ from ...test_modeling_common import (
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches
|
||||
from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches, unpad_image
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
@@ -288,6 +288,19 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
||||
image_sizes = torch.cat([image_sizes, image_sizes], dim=0)
|
||||
_ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
|
||||
|
||||
def test_unpad_image(self):
    """Check that `unpad_image` recovers a 400x400 size from odd-sized padded tensors."""
    expected_hw = (400, 400)

    # First shape carries extra width (601), the second extra height (503).
    for padded_shape in ([3, 400, 601], [3, 503, 400]):
        restored = unpad_image(floats_tensor(padded_shape), expected_hw)
        self.assertEqual(restored.shape[1:], expected_hw)
|
||||
|
||||
@parameterized.expand(
|
||||
[
|
||||
(-1,),
|
||||
|
||||
@@ -47,6 +47,8 @@ from ...test_modeling_common import (
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers.models.llava_next_video.modeling_llava_next_video import unpad_image
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
@@ -302,6 +304,19 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
image_sizes = torch.cat([image_sizes, image_sizes], dim=0)
|
||||
_ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
|
||||
|
||||
def test_unpad_image(self):
    """Check that `unpad_image` crops width- and height-padded tensors back to 400x400."""
    target_size = (400, 400)
    # One tensor shape padded along the width, one padded along the height.
    padded_shapes = ([3, 400, 601], [3, 503, 400])

    for shape in padded_shapes:
        padded = floats_tensor(shape)
        cropped = unpad_image(padded, target_size)
        # Dimension 0 is skipped; only the trailing two dimensions are compared.
        self.assertEqual(cropped.shape[1:], target_size)
|
||||
|
||||
@parameterized.expand(
|
||||
[
|
||||
(-1,),
|
||||
|
||||
@@ -16,7 +16,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ChannelDimension
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
@@ -305,3 +305,38 @@ class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestC
|
||||
) # FIXME yoni
|
||||
def test_can_compile_fast_image_processor(self):
|
||||
pass
|
||||
|
||||
def test_pad_for_patching(self):
    """Verify `_pad_for_patching` reaches target shapes that have one odd side.

    Each image processor class pads a 400x400 image to (400, 601) and to
    (503, 400), and the resulting height/width must equal the target exactly.
    """

    def height_width(array, data_format):
        # Drop the channel axis: last for channels-last, first otherwise.
        return array.shape[:-1] if data_format == ChannelDimension.LAST else array.shape[1:]

    for processor_cls in self.image_processor_list:
        is_fast = processor_cls == self.fast_image_processing_class
        # Fast class: torchified inputs and its own data format.
        # Other classes: numpified inputs handled as channels-last.
        data_format = processor_cls.data_format if is_fast else ChannelDimension.LAST
        processor = processor_cls(**self.image_processor_dict)

        # The prepared image is square; the padding targets supply the odd sizes.
        image = self.image_processor_tester.prepare_image_inputs(
            equal_resolution=True,
            numpify=not is_fast,
            torchify=is_fast,
        )[0]
        self.assertIn(image.shape, [(3, 400, 400), (400, 400, 3)])

        # Odd width
        odd_width_shape = (400, 601)
        padded = processor._pad_for_patching(image, odd_width_shape, data_format)
        self.assertEqual(height_width(padded, data_format), odd_width_shape)

        # Odd height
        odd_height_shape = (503, 400)
        padded = processor._pad_for_patching(image, odd_height_shape, data_format)
        self.assertEqual(height_width(padded, data_format), odd_height_shape)
|
||||
|
||||
@@ -48,6 +48,8 @@ from ...test_modeling_common import (
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers.models.llava_onevision.modeling_llava_onevision import unpad_image
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
@@ -258,6 +260,19 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
|
||||
torch.testing.assert_close(out_embeds, out_ids)
|
||||
|
||||
def test_unpad_image(self):
    """Check that `unpad_image` returns 400x400 for tensors padded by an odd amount."""
    size_before_padding = (400, 400)

    def unpadded_hw(padded_shape):
        # Build a random tensor of the padded shape and return its unpadded trailing dims.
        return unpad_image(floats_tensor(padded_shape), size_before_padding).shape[1:]

    # Width padded from 400 to 601.
    self.assertEqual(unpadded_hw([3, 400, 601]), size_before_padding)
    # Height padded from 400 to 503.
    self.assertEqual(unpadded_hw([3, 503, 400]), size_before_padding)
|
||||
|
||||
@parameterized.expand(
|
||||
[
|
||||
(-1,),
|
||||
|
||||
Reference in New Issue
Block a user