[video processors] support frame sampling within processors (#38105)

* apply updates smolVLM (still needs workaround for chat template)

* add other models

* dump qwen omni for now, come back later

* port qwen omni from their impl

* wait, all qwens sample videos in same way!

* clean up

* make smolvlm backwards compatible and fix padding

* dix some tests

* fox smolvlm tests

* more clean up and test fixing

* delete unused arg

* fix

* address comments

* style

* fix test
This commit is contained in:
Raushan Turganbay
2025-06-12 11:34:30 +02:00
committed by GitHub
parent 887054c714
commit 27459025b8
25 changed files with 864 additions and 795 deletions

View File

@@ -17,7 +17,6 @@ import shutil
import tempfile
import unittest
from huggingface_hub import hf_hub_download
from parameterized import parameterized
from transformers import AutoProcessor, AutoTokenizer, InternVLProcessor
@@ -180,77 +179,6 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
)
images_patches_index += inputs["pixel_values"].shape[0]
# Override video chat_template tests as InternVLProcessor returns flattened video features
@require_av
@require_torch
def test_apply_chat_template_video_special_processing(self):
"""
Tests that models can use their own preprocessing to preprocess conversations.
"""
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
signature = inspect.signature(processor.__call__)
if "videos" not in {*signature.parameters.keys()} or (
signature.parameters.get("videos") is not None
and signature.parameters["videos"].annotation == inspect._empty
):
self.skipTest("Processor doesn't accept videos at input")
video_file_path = hf_hub_download(
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
)
messages = [
[
{
"role": "user",
"content": [
{"type": "video", "path": video_file_path},
{"type": "text", "text": "What is shown in this video?"},
],
},
]
]
def _process_messages_for_chat_template(
conversation,
batch_images,
batch_videos,
batch_video_metadata,
**chat_template_kwargs,
):
# Let us just always return a dummy prompt
new_msg = [
[
{
"role": "user",
"content": [
{"type": "video"}, # no need to use path, video is loaded already by this moment
{"type": "text", "text": "Dummy prompt for preprocess testing"},
],
},
]
]
return new_msg
processor._process_messages_for_chat_template = _process_messages_for_chat_template
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
num_frames=8,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
# Difference with common tests, InternVLProcessor returns flattened video features, and uses 8 frames by default
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)
@require_torch
@require_av
def test_apply_chat_template_video_frame_sampling(self):
@@ -393,13 +321,13 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
tokenize=True,
return_dict=True,
return_tensors="pt",
num_frames=4, # by default no more than 4 frames, otherwise too slow
num_frames=2, # by default no more than 2 frames, otherwise too slow
)
self.assertTrue(self.videos_input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
video_len = 4 if batch_size == 1 else 3 # InternVL patches out and removes frames after processing
video_len = 2 if batch_size == 1 else 3 # InternVL patches out and removes frames after processing
self.assertEqual(len(out_dict[self.videos_input_name]), video_len)
for k in out_dict:
self.assertIsInstance(out_dict[k], torch.Tensor)