From 3c1895aa65a8ee998cb5784b06603d84d807f0f7 Mon Sep 17 00:00:00 2001
From: Jacky Lee <39754370+jla524@users.noreply.github.com>
Date: Wed, 8 Jan 2025 04:49:00 -0800
Subject: [PATCH] Fix Qwen2VL processor to handle odd number of frames (#35431)

* fix: processing odd number of frames

* feat: add test case

* update: test one frame

* feat: support custom patch size

* fix: test with videos

* revert: change on patch repeat

* fix: much wow

* update: fixups

* fixup pls

* ruff fixup

* fix typo at least
---
 .../qwen2_vl/image_processing_qwen2_vl.py     |  5 ++-
 .../test_image_processing_qwen2_vl.py         | 39 ++++++++++++++++++-
 .../models/qwen2_vl/test_modeling_qwen2_vl.py |  2 +-
 tests/test_image_processing_common.py         | 18 ++++-----
 4 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
index 407034ef18..b8656a9103 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
@@ -291,8 +291,9 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
         patches = np.array(processed_images)
         if data_format == ChannelDimension.LAST:
             patches = patches.transpose(0, 3, 1, 2)
-        if patches.shape[0] == 1:
-            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
+        num_pad = (-patches.shape[0]) % self.temporal_patch_size  # frames missing to fill the last temporal patch
+        if num_pad > 0:
+            patches = np.concatenate([patches, np.repeat(patches[-1:], num_pad, axis=0)], axis=0)
         channel = patches.shape[1]
         grid_t = patches.shape[0] // self.temporal_patch_size
         grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
index a6004349b4..76220dc66e 100644
--- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -22,7 +22,7 @@ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
 
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
 
 
 if is_torch_available():
@@ -40,6 +40,7 @@ class Qwen2VLImageProcessingTester:
         parent,
         batch_size=7,
         num_channels=3,
+        num_frames=10,
         min_resolution=56,
         max_resolution=1024,
         min_pixels=56 * 56,
@@ -58,6 +59,7 @@ class Qwen2VLImageProcessingTester:
         self.min_resolution = min_resolution
         self.max_resolution = max_resolution
         self.num_channels = num_channels
+        self.num_frames = num_frames
         self.image_mean = OPENAI_CLIP_MEAN
         self.image_std = OPENAI_CLIP_STD
         self.min_pixels = min_pixels
@@ -95,6 +97,18 @@ class Qwen2VLImageProcessingTester:
         )
         return [[image] for image in images]
 
+    def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+        return prepare_video_inputs(
+            batch_size=self.batch_size,
+            num_channels=self.num_channels,
+            num_frames=self.num_frames,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            numpify=numpify,
+            torchify=torchify,
+        )
+
 
 @require_torch
 @require_vision
@@ -247,3 +261,26 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
         # Image processor should return same pixel values, independently of ipnut format
         self.assertTrue((encoded_images_nested == encoded_images).all())
         self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
+
+    def test_video_inputs(self):
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        expected_dims_by_frames = {1: 34300, 2: 34300, 3: 68600, 4: 68600, 5: 102900, 6: 102900}
+        # Expected row counts grow once per pair of frames (presumably temporal_patch_size == 2 — confirm).
+        for num_frames, expected_dims in expected_dims_by_frames.items():
+            image_processor_tester = Qwen2VLImageProcessingTester(self, num_frames=num_frames)
+            video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
+            process_out = image_processing(None, videos=video_inputs, return_tensors="pt")
+            encoded_video = process_out.pixel_values_videos
+            expected_output_video_shape = (expected_dims, 1176)
+            self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
+
+    def test_custom_patch_size(self):
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        # NOTE(review): patch_size only reaches the tester, not image_processing — confirm this is intended.
+        for patch_size in (1, 3, 5, 7):
+            image_processor_tester = Qwen2VLImageProcessingTester(self, patch_size=patch_size)
+            video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
+            process_out = image_processing(None, videos=video_inputs, return_tensors="pt")
+            encoded_video = process_out.pixel_values_videos
+            expected_output_video_shape = (171500, 1176)
+            self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index 2c27e1a03a..aedd379926 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -253,7 +253,7 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
         """
         Tests that VLMs through an error with explicit message saying what is wrong
         when number of images don't match number of image tokens in the text.
-        Also we need to test multi-image cases when one prompr has multiple image tokens.
+        Also we need to test multi-image cases when one prompt has multiple image tokens.
         """
         config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py
index 1cb92174df..971462f9e3 100644
--- a/tests/test_image_processing_common.py
+++ b/tests/test_image_processing_common.py
@@ -125,19 +125,19 @@ def prepare_video_inputs(
     assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
 
     video_inputs = []
-    for i in range(batch_size):
+    for _ in range(batch_size):
         if equal_resolution:
             width = height = max_resolution
         else:
             width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
-            video = prepare_video(
-                num_frames=num_frames,
-                num_channels=num_channels,
-                width=width,
-                height=height,
-                numpify=numpify,
-                torchify=torchify,
-            )
+        video = prepare_video(
+            num_frames=num_frames,
+            num_channels=num_channels,
+            width=width,
+            height=height,
+            numpify=numpify,
+            torchify=torchify,
+        )
         video_inputs.append(video)
 
     return video_inputs