[video processors] support frame sampling within processors (#38105)

* apply updates smolVLM (still needs workaround for chat template) * add other models * dump qwen omni for now, come back later * port qwen omni from their impl * wait, all qwens sample videos in same way! * clean up * make smolvlm backwards compatible and fix padding * fix some tests * fix smolvlm tests * more clean up and test fixing * delete unused arg * fix * address comments * style * fix test
2025-06-12 11:34:30 +02:00
parent 887054c714
commit 27459025b8
25 changed files with 864 additions and 795 deletions
--- a/tests/models/internvl/test_processor_internvl.py
+++ b/tests/models/internvl/test_processor_internvl.py
@@ -17,7 +17,6 @@ import shutil
 import tempfile
 import unittest

-from huggingface_hub import hf_hub_download
 from parameterized import parameterized

 from transformers import AutoProcessor, AutoTokenizer, InternVLProcessor
@@ -180,77 +179,6 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            )
            images_patches_index += inputs["pixel_values"].shape[0]

-    # Override video chat_template tests as InternVLProcessor returns flattened video features
-    @require_av
-    @require_torch
-    def test_apply_chat_template_video_special_processing(self):
-        """
-        Tests that models can use their own preprocessing to preprocess conversations.
-        """
-        processor = self.get_processor()
-        if processor.chat_template is None:
-            self.skipTest("Processor has no chat template")
-
-        signature = inspect.signature(processor.__call__)
-        if "videos" not in {*signature.parameters.keys()} or (
-            signature.parameters.get("videos") is not None
-            and signature.parameters["videos"].annotation == inspect._empty
-        ):
-            self.skipTest("Processor doesn't accept videos at input")
-
-        video_file_path = hf_hub_download(
-            repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
-        )
-        messages = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "video", "path": video_file_path},
-                        {"type": "text", "text": "What is shown in this video?"},
-                    ],
-                },
-            ]
-        ]
-
-        def _process_messages_for_chat_template(
-            conversation,
-            batch_images,
-            batch_videos,
-            batch_video_metadata,
-            **chat_template_kwargs,
-        ):
-            # Let us just always return a dummy prompt
-            new_msg = [
-                [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "video"},  # no need to use path, video is loaded already by this moment
-                            {"type": "text", "text": "Dummy prompt for preprocess testing"},
-                        ],
-                    },
-                ]
-            ]
-            return new_msg
-
-        processor._process_messages_for_chat_template = _process_messages_for_chat_template
-        out_dict_with_video = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-            num_frames=8,
-        )
-        self.assertTrue(self.videos_input_name in out_dict_with_video)
-
-        # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
-        formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
-        self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
-        # Difference with common tests, InternVLProcessor returns flattened video features, and uses 8 frames by default
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)
-
    @require_torch
    @require_av
    def test_apply_chat_template_video_frame_sampling(self):
@@ -393,13 +321,13 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
-            num_frames=4,  # by default no more than 4 frames, otherwise too slow
+            num_frames=2,  # by default no more than 2 frames, otherwise too slow
        )
        self.assertTrue(self.videos_input_name in out_dict)
        self.assertEqual(len(out_dict["input_ids"]), batch_size)
        self.assertEqual(len(out_dict["attention_mask"]), batch_size)

-        video_len = 4 if batch_size == 1 else 3  # InternVL patches out and removes frames after processing
+        video_len = 2 if batch_size == 1 else 3  # InternVL patches out and removes frames after processing
        self.assertEqual(len(out_dict[self.videos_input_name]), video_len)
        for k in out_dict:
            self.assertIsInstance(out_dict[k], torch.Tensor)