Prepare processors for VideoLLMs (#36149)

* allow processor to preprocess conversation + video metadata * allow callable * add test * fix test * nit: fix * add metadata frames_indices * Update src/transformers/processing_utils.py Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> * Update src/transformers/processing_utils.py Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> * port updates from Orr and add one more test * Update src/transformers/processing_utils.py Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> * typo * as dataclass * style * docstring + maek sure tests green --------- Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
2025-02-14 11:34:08 +01:00
parent 33d1d715b0
commit 15ec971b8e
3 changed files with 428 additions and 138 deletions
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -22,6 +22,7 @@ from pathlib import Path
 from typing import Optional

 import numpy as np
+from huggingface_hub import hf_hub_download

 from transformers.models.auto.processing_auto import processor_class_from_name
 from transformers.processing_utils import Unpack
@@ -538,7 +539,7 @@ class ProcessorTesterMixin:

    def test_chat_template_save_loading(self):
        processor = self.get_processor()
-        signature = inspect.signature(processor.__call__)
+        signature = inspect.signature(processor.__init__)
        if "chat_template" not in {*signature.parameters.keys()}:
            self.skipTest("Processor doesn't accept chat templates at input")

@@ -858,3 +859,119 @@ class ProcessorTesterMixin:
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 2)
+
+    @require_av
+    def test_chat_template_video_custom_sampling(self):
+        """
+        Tests that models can pass their custom callables to sample video indices.
+        """
+        processor = self.get_processor()
+        if processor.chat_template is None:
+            self.skipTest("Processor has no chat template")
+
+        signature = inspect.signature(processor.__call__)
+        if "videos" not in {*signature.parameters.keys()} or (
+            signature.parameters.get("videos") is not None
+            and signature.parameters["videos"].annotation == inspect._empty
+        ):
+            self.skipTest("Processor doesn't accept videos at input")
+
+        video_file_path = hf_hub_download(
+            repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
+        )
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "video",
+                            "path": video_file_path,
+                        },
+                        {"type": "text", "text": "What is shown in this video?"},
+                    ],
+                },
+            ]
+        ]
+
+        def dummmy_sample_indices_fn(metadata, **fn_kwargs):
+            # sample only the first two frame always
+            return [0, 1]
+
+        out_dict_with_video = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            sample_indices_fn=dummmy_sample_indices_fn,
+        )
+        self.assertTrue(self.videos_input_name in out_dict_with_video)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 2)
+
+    @require_av
+    def test_chat_template_video_special_processing(self):
+        """
+        Tests that models can use their own preprocessing to preprocess conversations.
+        """
+        processor = self.get_processor()
+        if processor.chat_template is None:
+            self.skipTest("Processor has no chat template")
+
+        signature = inspect.signature(processor.__call__)
+        if "videos" not in {*signature.parameters.keys()} or (
+            signature.parameters.get("videos") is not None
+            and signature.parameters["videos"].annotation == inspect._empty
+        ):
+            self.skipTest("Processor doesn't accept videos at input")
+
+        video_file_path = hf_hub_download(
+            repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
+        )
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "video", "path": video_file_path},
+                        {"type": "text", "text": "What is shown in this video?"},
+                    ],
+                },
+            ]
+        ]
+
+        def _process_messages_for_chat_template(
+            conversation,
+            batch_images,
+            batch_videos,
+            batch_video_metadata,
+            **chat_template_kwargs,
+        ):
+            # Let us just always return a dummy prompt
+            new_msg = [
+                [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "video"},  # no need to use path, video is loaded already by this moment
+                            {"type": "text", "text": "Dummy prompt for preprocess testing"},
+                        ],
+                    },
+                ]
+            ]
+            return new_msg
+
+        processor._process_messages_for_chat_template = _process_messages_for_chat_template
+        out_dict_with_video = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+        )
+        self.assertTrue(self.videos_input_name in out_dict_with_video)
+
+        # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
+        formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
+        self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 243)