# Patch summary (text before the first "diff --git" header is not part of any hunk):
#
# 1. image_processing_qwen2_vl.py: when the number of frames (patches.shape[0]) is not a
#    multiple of temporal_patch_size, the last frame is repeated to pad the sequence.
#    The repeat count changes from `temporal_patch_size - 1` to
#    `temporal_patch_size - (patches.shape[0] % temporal_patch_size)`, i.e. exactly the
#    number of frames missing to reach the next multiple. The old count was only right
#    when the remainder was 1.
# 2. test_image_processing_qwen2_vl.py: adds `test_temporal_padding`, which processes
#    video inputs from a tester built with num_frames=5 / temporal_patch_size=4 and checks
#    the output shape and that its first dimension is divisible by 4.
#
# NOTE(review): the line structure of this patch was restored from a whitespace-collapsed
# copy; the indentation of context lines is reconstructed from Python nesting and must be
# confirmed against the target files before applying.
# NOTE(review): the processor under test is built from `self.image_processor_dict`, while
# `temporal_patch_size=4` is passed only to the separately constructed
# `Qwen2VLImageProcessingTester` -- confirm the processor actually runs with
# temporal_patch_size=4, otherwise the final `% 4` assertion does not exercise the fix.
diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
index 4eb3ce022e..b85085476c 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
@@ -274,7 +274,9 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
         if data_format == ChannelDimension.LAST:
             patches = patches.transpose(0, 3, 1, 2)
         if patches.shape[0] % temporal_patch_size != 0:
-            repeats = np.repeat(patches[-1][np.newaxis], temporal_patch_size - 1, axis=0)
+            repeats = np.repeat(
+                patches[-1][np.newaxis], temporal_patch_size - (patches.shape[0] % temporal_patch_size), axis=0
+            )
             patches = np.concatenate([patches, repeats], axis=0)
         channel = patches.shape[1]
         grid_t = patches.shape[0] // temporal_patch_size
diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
index bae6e011a6..5e600338b3 100644
--- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -312,6 +312,24 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
             expected_output_video_shape = [112, 1176]
             self.assertListEqual(list(prcocess_out.pixel_values.shape), expected_output_video_shape)
 
+    def test_temporal_padding(self):
+        for image_processing_class in self.image_processor_list:
+            # Initialize image_processing
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # Create random video inputs with a number of frames not divisible by temporal_patch_size
+            image_processor_tester = Qwen2VLImageProcessingTester(self, num_frames=5, temporal_patch_size=4)
+            video_inputs = image_processor_tester.prepare_video_inputs(equal_resolution=True)
+
+            # Process the video inputs
+            process_out = image_processing(None, videos=video_inputs, return_tensors="pt")
+            encoded_video = process_out.pixel_values_videos
+
+            # Check the shape after padding
+            expected_output_video_shape = (102900, 1176)  # Adjusted based on padding
+            self.assertEqual(tuple(encoded_video.shape), expected_output_video_shape)
+            # Check divisibility by temporal_patch_size
+            self.assertEqual(encoded_video.shape[0] % 4, 0)
+
     @require_vision
     @require_torch
     def test_slow_fast_equivalence(self):