[video processors] support frame sampling within processors (#38105)

* apply updates smolVLM (still needs workaround for chat template)

* add other models

* dump qwen omni for now, come back later

* port qwen omni from their impl

* wait, all qwens sample videos in same way!

* clean up

* make smolvlm backwards compatible and fix padding

* fix some tests

* fix smolvlm tests

* more clean up and test fixing

* delete unused arg

* fix

* address comments

* style

* fix test
This commit is contained in:
Raushan Turganbay
2025-06-12 11:34:30 +02:00
committed by GitHub
parent 887054c714
commit 27459025b8
25 changed files with 864 additions and 795 deletions

View File

@@ -293,6 +293,59 @@ class VideoProcessingTestMixin:
(self.video_processor_tester.batch_size, *expected_output_video_shape),
)
def test_call_sample_frames(self):
    """Check how the video processor samples frames when called.

    For every class in `self.video_processor_list`, 8-frame inputs are prepared and the
    following expectations on `shape[1]` of the output are verified, for both a single
    video and the whole batch:

    * `do_sample_frames=False` keeps all 8 frames even when `num_frames=3` is passed.
    * `do_sample_frames=True` with `num_frames=3` yields 3 frames.
    * `fps=3` without metadata raises `ValueError`.
    * `fps=3` with metadata describing a 2.0s video yields 6 frames.
    * `num_frames=10` (more than the 8 available frames) raises `ValueError`.
    """
    for video_processing_class in self.video_processor_list:
        video_processing = video_processing_class(**self.video_processor_dict)

        prev_num_frames = self.video_processor_tester.num_frames
        self.video_processor_tester.num_frames = 8
        # The tester object is shared state: restore `num_frames` in `finally` so that a
        # failing assertion here cannot leak the overridden value into other tests.
        try:
            video_inputs = self.video_processor_tester.prepare_video_inputs(
                equal_resolution=False,
                return_tensors="torch",
            )

            # Force set sampling to False. No sampling is expected even when `num_frames` exists
            video_processing.do_sample_frames = False

            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
            encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
            self.assertEqual(encoded_videos.shape[1], 8)
            self.assertEqual(encoded_videos_batched.shape[1], 8)

            # Set sampling to True. Video frames should be sampled with `num_frames` in the output
            video_processing.do_sample_frames = True

            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
            encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
            self.assertEqual(encoded_videos.shape[1], 3)
            self.assertEqual(encoded_videos_batched.shape[1], 3)

            # Sample with `fps` requires metadata to infer number of frames from total duration.
            # Each call gets its own `assertRaises` context: inside a shared context the first
            # raise exits the block and the batched call would never be executed.
            with self.assertRaises(ValueError):
                video_processing(video_inputs[0], return_tensors="pt", fps=3)
            with self.assertRaises(ValueError):
                video_processing(video_inputs, return_tensors="pt", fps=3)

            metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
            batched_metadata = metadata * len(video_inputs)
            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata)[
                self.input_name
            ]
            encoded_videos_batched = video_processing(
                video_inputs, return_tensors="pt", fps=3, video_metadata=batched_metadata
            )[self.input_name]
            self.assertEqual(encoded_videos.shape[1], 6)
            self.assertEqual(encoded_videos_batched.shape[1], 6)

            # We should raise error when asked to sample more frames than there are in input video
            with self.assertRaises(ValueError):
                video_processing(video_inputs[0], return_tensors="pt", num_frames=10)
            with self.assertRaises(ValueError):
                video_processing(video_inputs, return_tensors="pt", num_frames=10)
        finally:
            # Assign back the actual num frames in tester
            self.video_processor_tester.num_frames = prev_num_frames
def test_nested_input(self):
"""Tests that the processor can work with nested list where each video is a list of arrays"""
for video_processing_class in self.video_processor_list: