From 3c1895aa65a8ee998cb5784b06603d84d807f0f7 Mon Sep 17 00:00:00 2001 From: Jacky Lee <39754370+jla524@users.noreply.github.com> Date: Wed, 8 Jan 2025 04:49:00 -0800 Subject: [PATCH] Fix Qwen2VL processor to handle odd number of frames (#35431) * fix: processing odd number of frames * feat: add test case * update: test one frame * feat: support custom patch size * fix: test with videos * revert: change on patch repeat * fix: much wow * update: fixups * fixup pls * ruff fixup * fix typo at least --- Review note: the padding step appends (-num_frames % temporal_patch_size) copies of the last frame, so the frame count always reaches the next multiple of temporal_patch_size; appending temporal_patch_size - 1 copies was only correct when the remainder was exactly 1. .../qwen2_vl/image_processing_qwen2_vl.py | 5 ++- .../test_image_processing_qwen2_vl.py | 39 ++++++++++++++++++- .../models/qwen2_vl/test_modeling_qwen2_vl.py | 2 +- tests/test_image_processing_common.py | 18 ++++----- 4 files changed, 51 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py index 407034ef18..b8656a9103 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py @@ -291,8 +291,9 @@ class Qwen2VLImageProcessor(BaseImageProcessor): patches = np.array(processed_images) if data_format == ChannelDimension.LAST: patches = patches.transpose(0, 3, 1, 2) - if patches.shape[0] == 1: - patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1)) + if patches.shape[0] % self.temporal_patch_size != 0: + repeats = np.repeat(patches[-1][np.newaxis], -patches.shape[0] % self.temporal_patch_size, axis=0) + patches = np.concatenate([patches, repeats], axis=0) channel = patches.shape[1] grid_t = patches.shape[0] // self.temporal_patch_size grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py index a6004349b4..76220dc66e 100644 --- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py +++ 
b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py @@ -22,7 +22,7 @@ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs if is_torch_available(): @@ -40,6 +40,7 @@ class Qwen2VLImageProcessingTester: parent, batch_size=7, num_channels=3, + num_frames=10, min_resolution=56, max_resolution=1024, min_pixels=56 * 56, @@ -58,6 +59,7 @@ class Qwen2VLImageProcessingTester: self.min_resolution = min_resolution self.max_resolution = max_resolution self.num_channels = num_channels + self.num_frames = num_frames self.image_mean = OPENAI_CLIP_MEAN self.image_std = OPENAI_CLIP_STD self.min_pixels = min_pixels @@ -95,6 +97,18 @@ class Qwen2VLImageProcessingTester: ) return [[image] for image in images] + def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_video_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + num_frames=self.num_frames, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + @require_torch @require_vision @@ -247,3 +261,26 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): # Image processor should return same pixel values, independently of ipnut format self.assertTrue((encoded_images_nested == encoded_images).all()) self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all()) + + def test_video_inputs(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + expected_dims_by_frames = {1: 34300, 2: 34300, 3: 68600, 4: 68600, 5: 102900, 6: 
102900} + + for num_frames, expected_dims in expected_dims_by_frames.items(): + image_processor_tester = Qwen2VLImageProcessingTester(self, num_frames=num_frames) + video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True) + process_out = image_processing(None, videos=video_inputs, return_tensors="pt") + encoded_video = process_out.pixel_values_videos + expected_output_video_shape = (expected_dims, 1176) + self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape) + + def test_custom_patch_size(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + + for patch_size in (1, 3, 5, 7): + image_processor_tester = Qwen2VLImageProcessingTester(self, patch_size=patch_size) + video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True) + process_out = image_processing(None, videos=video_inputs, return_tensors="pt") + encoded_video = process_out.pixel_values_videos + expected_output_video_shape = (171500, 1176) + self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 2c27e1a03a..aedd379926 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -253,7 +253,7 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas """ Tests that VLMs through an error with explicit message saying what is wrong when number of images don't match number of image tokens in the text. - Also we need to test multi-image cases when one prompr has multiple image tokens. + Also we need to test multi-image cases when one prompt has multiple image tokens. 
""" config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index 1cb92174df..971462f9e3 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -125,19 +125,19 @@ def prepare_video_inputs( assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" video_inputs = [] - for i in range(batch_size): + for _ in range(batch_size): if equal_resolution: width = height = max_resolution else: width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2) - video = prepare_video( - num_frames=num_frames, - num_channels=num_channels, - width=width, - height=height, - numpify=numpify, - torchify=torchify, - ) + video = prepare_video( + num_frames=num_frames, + num_channels=num_channels, + width=width, + height=height, + numpify=numpify, + torchify=torchify, + ) video_inputs.append(video) return video_inputs