Fix Qwen2VL processor to handle odd number of frames (#35431)

* fix: processing odd number of frames * feat: add test case * update: test one frame * feat: support custom patch size * fix: test with videos * revert: change on patch repeat * fix: much wow * update: fixups * fixup pls * ruff fixup * fix typo at least
2025-01-08 04:49:00 -08:00
parent 3fde88b19d
commit 3c1895aa65
4 changed files with 51 additions and 13 deletions
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
@@ -291,8 +291,9 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose(0, 3, 1, 2)
-        if patches.shape[0] == 1:
+        if patches.shape[0] % self.temporal_patch_size != 0:
-            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
+            repeats = np.repeat(patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0)
            patches = np.concatenate([patches, repeats], axis=0)
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
--- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -22,7 +22,7 @@ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
 if is_torch_available():
@@ -40,6 +40,7 @@ class Qwen2VLImageProcessingTester:
        parent,
        batch_size=7,
        num_channels=3,
        num_frames=10,
        min_resolution=56,
        max_resolution=1024,
        min_pixels=56 * 56,
@@ -58,6 +59,7 @@ class Qwen2VLImageProcessingTester:
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.num_channels = num_channels
        self.num_frames = num_frames
        self.image_mean = OPENAI_CLIP_MEAN
        self.image_std = OPENAI_CLIP_STD
        self.min_pixels = min_pixels
@@ -95,6 +97,18 @@ class Qwen2VLImageProcessingTester:
        )
        return [[image] for image in images]
    def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        return prepare_video_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            num_frames=self.num_frames,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )
@require_torch
@require_vision
@@ -247,3 +261,26 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        # Image processor should return same pixel values, independently of ipnut format
        self.assertTrue((encoded_images_nested == encoded_images).all())
        self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
    def test_video_inputs(self):
        image_processing = self.image_processing_class(**self.image_processor_dict)
        expected_dims_by_frames = {1: 34300, 2: 34300, 3: 68600, 4: 68600, 5: 102900, 6: 102900}
        for num_frames, expected_dims in expected_dims_by_frames.items():
            image_processor_tester = Qwen2VLImageProcessingTester(self, num_frames=num_frames)
            video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
            prcocess_out = image_processing(None, videos=video_inputs, return_tensors="pt")
            encoded_video = prcocess_out.pixel_values_videos
            expected_output_video_shape = (expected_dims, 1176)
            self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
    def test_custom_patch_size(self):
        image_processing = self.image_processing_class(**self.image_processor_dict)
        for patch_size in (1, 3, 5, 7):
            image_processor_tester = Qwen2VLImageProcessingTester(self, patch_size=patch_size)
            video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
            prcocess_out = image_processing(None, videos=video_inputs, return_tensors="pt")
            encoded_video = prcocess_out.pixel_values_videos
            expected_output_video_shape = (171500, 1176)
            self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -253,7 +253,7 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
        """
        Tests that VLMs through an error with explicit message saying what is wrong
        when number of images don't match number of image tokens in the text.
-        Also we need to test multi-image cases when one prompr has multiple image tokens.
+        Also we need to test multi-image cases when one prompt has multiple image tokens.
        """
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
--- a/tests/test_image_processing_common.py
+++ b/tests/test_image_processing_common.py
@@ -125,19 +125,19 @@ def prepare_video_inputs(
    assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
    video_inputs = []
-    for i in range(batch_size):
+    for _ in range(batch_size):
        if equal_resolution:
            width = height = max_resolution
        else:
            width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
-            video = prepare_video(
+        video = prepare_video(
-                num_frames=num_frames,
+            num_frames=num_frames,
-                num_channels=num_channels,
+            num_channels=num_channels,
-                width=width,
+            width=width,
-                height=height,
+            height=height,
-                numpify=numpify,
+            numpify=numpify,
-                torchify=torchify,
+            torchify=torchify,
-            )
+        )
        video_inputs.append(video)
    return video_inputs