Fix Qwen2VL processor to handle odd number of frames (#35431)
* fix: processing odd number of frames * feat: add test case * update: test one frame * feat: support custom patch size * fix: test with videos * revert: change on patch repeat * fix: much wow * update: fixups * fixup pls * ruff fixup * fix typo at least
This commit is contained in:
@@ -291,8 +291,9 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
|
||||
patches = np.array(processed_images)
|
||||
if data_format == ChannelDimension.LAST:
|
||||
patches = patches.transpose(0, 3, 1, 2)
|
||||
if patches.shape[0] == 1:
|
||||
patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
|
||||
if patches.shape[0] % self.temporal_patch_size != 0:
|
||||
repeats = np.repeat(patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0)
|
||||
patches = np.concatenate([patches, repeats], axis=0)
|
||||
channel = patches.shape[1]
|
||||
grid_t = patches.shape[0] // self.temporal_patch_size
|
||||
grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
|
||||
|
||||
@@ -22,7 +22,7 @@ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
@@ -40,6 +40,7 @@ class Qwen2VLImageProcessingTester:
|
||||
parent,
|
||||
batch_size=7,
|
||||
num_channels=3,
|
||||
num_frames=10,
|
||||
min_resolution=56,
|
||||
max_resolution=1024,
|
||||
min_pixels=56 * 56,
|
||||
@@ -58,6 +59,7 @@ class Qwen2VLImageProcessingTester:
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.num_channels = num_channels
|
||||
self.num_frames = num_frames
|
||||
self.image_mean = OPENAI_CLIP_MEAN
|
||||
self.image_std = OPENAI_CLIP_STD
|
||||
self.min_pixels = min_pixels
|
||||
@@ -95,6 +97,18 @@ class Qwen2VLImageProcessingTester:
|
||||
)
|
||||
return [[image] for image in images]
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
|
||||
return prepare_video_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_channels=self.num_channels,
|
||||
num_frames=self.num_frames,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
numpify=numpify,
|
||||
torchify=torchify,
|
||||
)
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
@@ -247,3 +261,26 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
# Image processor should return same pixel values, independently of ipnut format
|
||||
self.assertTrue((encoded_images_nested == encoded_images).all())
|
||||
self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
|
||||
|
||||
def test_video_inputs(self):
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
expected_dims_by_frames = {1: 34300, 2: 34300, 3: 68600, 4: 68600, 5: 102900, 6: 102900}
|
||||
|
||||
for num_frames, expected_dims in expected_dims_by_frames.items():
|
||||
image_processor_tester = Qwen2VLImageProcessingTester(self, num_frames=num_frames)
|
||||
video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
|
||||
prcocess_out = image_processing(None, videos=video_inputs, return_tensors="pt")
|
||||
encoded_video = prcocess_out.pixel_values_videos
|
||||
expected_output_video_shape = (expected_dims, 1176)
|
||||
self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
|
||||
|
||||
def test_custom_patch_size(self):
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
|
||||
for patch_size in (1, 3, 5, 7):
|
||||
image_processor_tester = Qwen2VLImageProcessingTester(self, patch_size=patch_size)
|
||||
video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
|
||||
prcocess_out = image_processing(None, videos=video_inputs, return_tensors="pt")
|
||||
encoded_video = prcocess_out.pixel_values_videos
|
||||
expected_output_video_shape = (171500, 1176)
|
||||
self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
|
||||
|
||||
@@ -253,7 +253,7 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
|
||||
"""
|
||||
Tests that VLMs through an error with explicit message saying what is wrong
|
||||
when number of images don't match number of image tokens in the text.
|
||||
Also we need to test multi-image cases when one prompr has multiple image tokens.
|
||||
Also we need to test multi-image cases when one prompt has multiple image tokens.
|
||||
"""
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
|
||||
@@ -125,7 +125,7 @@ def prepare_video_inputs(
|
||||
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
|
||||
|
||||
video_inputs = []
|
||||
for i in range(batch_size):
|
||||
for _ in range(batch_size):
|
||||
if equal_resolution:
|
||||
width = height = max_resolution
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user