From 63c6331387d70b8669f0d519a2db39be45e10bf2 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Wed, 23 Apr 2025 17:08:11 +0200 Subject: [PATCH] Qwen 2.5 Omni: apply video defaults (#37660) * Apply video defaults for min_pixels and max_pixels * fps kwarg should not be a list * Update test to account for new resizing --- .../models/qwen2_5_omni/processing_qwen2_5_omni.py | 7 ++++--- tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index 64b444b716..57b2d43a3f 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -61,6 +61,8 @@ class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False): "seconds_per_chunk": 2.0, "position_id_per_seconds": 25, "use_audio_in_video": False, + "min_pixels": 128 * 28 * 28, + "max_pixels": 768 * 28 * 28, }, "audio_kwargs": { "sampling_rate": 16000, @@ -147,7 +149,7 @@ class Qwen2_5OmniProcessor(ProcessorMixin): seconds_per_chunk = output_kwargs["videos_kwargs"].pop("seconds_per_chunk") position_id_per_seconds = output_kwargs["videos_kwargs"].pop("position_id_per_seconds") use_audio_in_video = output_kwargs["videos_kwargs"].pop("use_audio_in_video") - fps = output_kwargs["videos_kwargs"].pop("fps", None) + fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) if audio is not None: output_kwargs["audio_kwargs"]["padding"] = "max_length" # Support "max_length" padding only here @@ -174,8 +176,7 @@ class Qwen2_5OmniProcessor(ProcessorMixin): if videos is not None: videos = make_batched_videos(videos) videos_inputs = self.image_processor(images=None, videos=videos, **output_kwargs["videos_kwargs"]) - if fps is None: - fps = [2.0] * len(videos) + fps = [fps] * len(videos) videos_inputs["video_second_per_grid"] = [ self.image_processor.temporal_patch_size / fps[i] for i in range(len(fps)) ] diff --git a/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py index 411063480d..60b4622968 100644 --- a/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py +++ b/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py @@ -433,7 +433,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase): num_frames=num_frames, ) self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 9568) + self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5760) # Load with `video_fps` arg video_fps = 1 @@ -445,7 +445,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase): video_fps=video_fps, ) self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 23920) + self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 14400) # Load with `video_fps` and `num_frames` args, should raise an error with self.assertRaises(ValueError): @@ -466,7 +466,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase): return_dict=True, ) self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 717600) + self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 432000) # Load video as a list of frames (i.e. images). NOTE: each frame should have same size # because we assume they come from one video @@ -484,7 +484,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase): return_dict=True, ) self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5704) + self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2904) @require_av def test_apply_chat_template_video_special_processing(self):