[video processors] support frame sampling within processors (#38105)

* apply updates smolVLM (still needs workaround for chat template) * add other models * dump qwen omni for now, come back later * port qwen omni from their impl * wait, all qwens sample videos in same way! * clean up * make smolvlm backwards compatible and fix padding * dix some tests * fox smolvlm tests * more clean up and test fixing * delete unused arg * fix * address comments * style * fix test
2025-06-12 11:34:30 +02:00
parent 887054c714
commit 27459025b8
25 changed files with 864 additions and 795 deletions
--- a/tests/models/instructblipvideo/test_video_processing_instructblipvideo.py
+++ b/tests/models/instructblipvideo/test_video_processing_instructblipvideo.py
--- a/tests/models/internvl/test_processor_internvl.py
+++ b/tests/models/internvl/test_processor_internvl.py
@@ -17,7 +17,6 @@ import shutil
 import tempfile
 import unittest

-from huggingface_hub import hf_hub_download
 from parameterized import parameterized

 from transformers import AutoProcessor, AutoTokenizer, InternVLProcessor
@@ -180,77 +179,6 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            )
            images_patches_index += inputs["pixel_values"].shape[0]

-    # Override video chat_template tests as InternVLProcessor returns flattened video features
-    @require_av
-    @require_torch
-    def test_apply_chat_template_video_special_processing(self):
-        """
-        Tests that models can use their own preprocessing to preprocess conversations.
-        """
-        processor = self.get_processor()
-        if processor.chat_template is None:
-            self.skipTest("Processor has no chat template")
-
-        signature = inspect.signature(processor.__call__)
-        if "videos" not in {*signature.parameters.keys()} or (
-            signature.parameters.get("videos") is not None
-            and signature.parameters["videos"].annotation == inspect._empty
-        ):
-            self.skipTest("Processor doesn't accept videos at input")
-
-        video_file_path = hf_hub_download(
-            repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
-        )
-        messages = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "video", "path": video_file_path},
-                        {"type": "text", "text": "What is shown in this video?"},
-                    ],
-                },
-            ]
-        ]
-
-        def _process_messages_for_chat_template(
-            conversation,
-            batch_images,
-            batch_videos,
-            batch_video_metadata,
-            **chat_template_kwargs,
-        ):
-            # Let us just always return a dummy prompt
-            new_msg = [
-                [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "video"},  # no need to use path, video is loaded already by this moment
-                            {"type": "text", "text": "Dummy prompt for preprocess testing"},
-                        ],
-                    },
-                ]
-            ]
-            return new_msg
-
-        processor._process_messages_for_chat_template = _process_messages_for_chat_template
-        out_dict_with_video = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-            num_frames=8,
-        )
-        self.assertTrue(self.videos_input_name in out_dict_with_video)
-
-        # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
-        formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
-        self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
-        # Difference with common tests, InternVLProcessor returns flattened video features, and uses 8 frames by default
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)
-
    @require_torch
    @require_av
    def test_apply_chat_template_video_frame_sampling(self):
@@ -393,13 +321,13 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
-            num_frames=4,  # by default no more than 4 frames, otherwise too slow
+            num_frames=2,  # by default no more than 2 frames, otherwise too slow
        )
        self.assertTrue(self.videos_input_name in out_dict)
        self.assertEqual(len(out_dict["input_ids"]), batch_size)
        self.assertEqual(len(out_dict["attention_mask"]), batch_size)

-        video_len = 4 if batch_size == 1 else 3  # InternVL patches out and removes frames after processing
+        video_len = 2 if batch_size == 1 else 3  # InternVL patches out and removes frames after processing
        self.assertEqual(len(out_dict[self.videos_input_name]), video_len)
        for k in out_dict:
            self.assertIsInstance(out_dict[k], torch.Tensor)
--- a/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py
+++ b/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py
@@ -407,14 +407,14 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenize=True,
            return_dict=True,
            return_tensors=return_tensors,
-            num_frames=4,  # by default no more than 4 frames, otherwise too slow
+            num_frames=2,  # by default no more than 2 frames, otherwise too slow
        )
        input_name = getattr(self, input_name)
        self.assertTrue(input_name in out_dict)
        self.assertEqual(len(out_dict["input_ids"]), batch_size)
        self.assertEqual(len(out_dict["attention_mask"]), batch_size)

-        video_len = 5760 if batch_size == 1 else 5808  # qwen pixels don't scale with bs same way as other models
+        video_len = 2880 if batch_size == 1 else 5808  # qwen pixels don't scale with bs same way as other models
        mm_len = batch_size * 1564 if modality == "image" else video_len
        self.assertEqual(len(out_dict[input_name]), mm_len)

@@ -525,72 +525,6 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2904)

-    @require_av
-    def test_apply_chat_template_video_special_processing(self):
-        """
-        Tests that models can use their own preprocessing to preprocess conversations.
-        """
-        processor = self.get_processor()
-        if processor.chat_template is None:
-            self.skipTest("Processor has no chat template")
-
-        signature = inspect.signature(processor.__call__)
-        if "videos" not in {*signature.parameters.keys()} or (
-            signature.parameters.get("videos") is not None
-            and signature.parameters["videos"].annotation == inspect._empty
-        ):
-            self.skipTest("Processor doesn't accept videos at input")
-
-        video_file_path = hf_hub_download(
-            repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
-        )
-        messages = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "video", "path": video_file_path},
-                        {"type": "text", "text": "What is shown in this video?"},
-                    ],
-                },
-            ]
-        ]
-
-        def _process_messages_for_chat_template(
-            conversation,
-            batch_images,
-            batch_videos,
-            batch_video_metadata,
-            **chat_template_kwargs,
-        ):
-            # Let us just always return a dummy prompt
-            new_msg = [
-                [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "video"},  # no need to use path, video is loaded already by this moment
-                            {"type": "text", "text": "Dummy prompt for preprocess testing"},
-                        ],
-                    },
-                ]
-            ]
-            return new_msg
-
-        processor._process_messages_for_chat_template = _process_messages_for_chat_template
-        out_dict_with_video = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-        )
-        self.assertTrue(self.videos_input_name in out_dict_with_video)
-
-        # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
-        formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
-        self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 145912)
-
    @require_librosa
    @require_av
    @unittest.skip(
--- a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
@@ -19,7 +19,6 @@ import unittest

 import numpy as np
 import pytest
-from huggingface_hub import hf_hub_download

 from transformers import AutoProcessor, Qwen2Tokenizer
 from transformers.testing_utils import require_av, require_torch, require_vision
@@ -219,14 +218,14 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenize=True,
            return_dict=True,
            return_tensors=return_tensors,
-            num_frames=4,  # by default no more than 4 frames, otherwise too slow
+            num_frames=2,  # by default no more than 2 frames, otherwise too slow
        )
        input_name = getattr(self, input_name)
        self.assertTrue(input_name in out_dict)
        self.assertEqual(len(out_dict["input_ids"]), batch_size)
        self.assertEqual(len(out_dict["attention_mask"]), batch_size)

-        video_len = 360 if batch_size == 1 else 320  # qwen pixels don't scale with bs same way as other models
+        video_len = 180 if batch_size == 1 else 320  # qwen pixels don't scale with bs same way as other models
        mm_len = batch_size * 192 if modality == "image" else video_len
        self.assertEqual(len(out_dict[input_name]), mm_len)

@@ -346,70 +345,3 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertEqual(inputs[self.images_input_name].shape[0], 612)
        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
        self.assertEqual(inputs[self.images_input_name].shape[0], 100)
-
-    @require_av
-    def test_apply_chat_template_video_special_processing(self):
-        """
-        Tests that models can use their own preprocessing to preprocess conversations.
-        """
-        processor = self.get_processor()
-        if processor.chat_template is None:
-            self.skipTest("Processor has no chat template")
-
-        signature = inspect.signature(processor.__call__)
-        if "videos" not in {*signature.parameters.keys()} or (
-            signature.parameters.get("videos") is not None
-            and signature.parameters["videos"].annotation == inspect._empty
-        ):
-            self.skipTest("Processor doesn't accept videos at input")
-
-        video_file_path = hf_hub_download(
-            repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
-        )
-        messages = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "video", "path": video_file_path},
-                        {"type": "text", "text": "What is shown in this video?"},
-                    ],
-                },
-            ]
-        ]
-
-        def _process_messages_for_chat_template(
-            conversation,
-            batch_images,
-            batch_videos,
-            batch_video_metadata,
-            **chat_template_kwargs,
-        ):
-            # Let us just always return a dummy prompt
-            new_msg = [
-                [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "video"},  # no need to use path, video is loaded already by this moment
-                            {"type": "text", "text": "Dummy prompt for preprocess testing"},
-                        ],
-                    },
-                ]
-            ]
-            return new_msg
-
-        processor._process_messages_for_chat_template = _process_messages_for_chat_template
-        out_dict_with_video = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-        )
-        self.assertTrue(self.videos_input_name in out_dict_with_video)
-
-        # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
-        formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
-        self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
--- a/tests/models/qwen2_vl/test_processor_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.py
@@ -19,7 +19,6 @@ import unittest

 import numpy as np
 import pytest
-from huggingface_hub import hf_hub_download

 from transformers import AutoProcessor, Qwen2Tokenizer
 from transformers.testing_utils import require_av, require_torch, require_vision
@@ -220,14 +219,14 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenize=True,
            return_dict=True,
            return_tensors=return_tensors,
-            num_frames=4,  # by default no more than 4 frames, otherwise too slow
+            num_frames=2,  # by default no more than 2 frames, otherwise too slow
        )
        input_name = getattr(self, input_name)
        self.assertTrue(input_name in out_dict)
        self.assertEqual(len(out_dict["input_ids"]), batch_size)
        self.assertEqual(len(out_dict["attention_mask"]), batch_size)

-        video_len = 360 if batch_size == 1 else 320  # qwen pixels don't scale with bs same way as other models
+        video_len = 180 if batch_size == 1 else 320  # qwen pixels don't scale with bs same way as other models
        mm_len = batch_size * 192 if modality == "image" else video_len
        self.assertEqual(len(out_dict[input_name]), mm_len)

@@ -337,73 +336,6 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)

-    @require_av
-    def test_apply_chat_template_video_special_processing(self):
-        """
-        Tests that models can use their own preprocessing to preprocess conversations.
-        """
-        processor = self.get_processor()
-        if processor.chat_template is None:
-            self.skipTest("Processor has no chat template")
-
-        signature = inspect.signature(processor.__call__)
-        if "videos" not in {*signature.parameters.keys()} or (
-            signature.parameters.get("videos") is not None
-            and signature.parameters["videos"].annotation == inspect._empty
-        ):
-            self.skipTest("Processor doesn't accept videos at input")
-
-        video_file_path = hf_hub_download(
-            repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
-        )
-        messages = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "video", "path": video_file_path},
-                        {"type": "text", "text": "What is shown in this video?"},
-                    ],
-                },
-            ]
-        ]
-
-        def _process_messages_for_chat_template(
-            conversation,
-            batch_images,
-            batch_videos,
-            batch_video_metadata,
-            **chat_template_kwargs,
-        ):
-            # Let us just always return a dummy prompt
-            new_msg = [
-                [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "video"},  # no need to use path, video is loaded already by this moment
-                            {"type": "text", "text": "Dummy prompt for preprocess testing"},
-                        ],
-                    },
-                ]
-            ]
-            return new_msg
-
-        processor._process_messages_for_chat_template = _process_messages_for_chat_template
-        out_dict_with_video = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-        )
-        self.assertTrue(self.videos_input_name in out_dict_with_video)
-
-        # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
-        formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
-        self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
-
    def test_kwargs_overrides_custom_image_processor_kwargs(self):
        processor = self.get_processor()
        self.skip_processor_without_typed_kwargs(processor)
--- a/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
@@ -99,8 +99,9 @@ class Qwen2VLVideoProcessingTester:
        }

    @require_vision
-    def expected_output_video_shape(self, videos):
-        grid_t = self.num_frames // self.temporal_patch_size
+    def expected_output_video_shape(self, videos, num_frames=None):
+        num_frames = num_frames if num_frames is not None else self.num_frames
+        grid_t = num_frames // self.temporal_patch_size
        hidden_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
        seq_len = 0
        for video in videos:
@@ -289,3 +290,70 @@ class Qwen2VLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
            )[self.input_name]
            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
+
+    def test_call_sample_frames(self):
+        for video_processing_class in self.video_processor_list:
+            video_processing = video_processing_class(**self.video_processor_dict)
+
+            prev_num_frames = self.video_processor_tester.num_frames
+            self.video_processor_tester.num_frames = 8
+            video_inputs = self.video_processor_tester.prepare_video_inputs(
+                equal_resolution=False,
+                return_tensors="torch",
+            )
+
+            # Force set sampling to False. No sampling is expected even when `num_frames` exists
+            video_processing.do_sample_frames = False
+
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
+            encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
+            expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(video_inputs)
+            self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
+            self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
+
+            # Set sampling to True. Video frames should be sampled with `num_frames` in the output
+            video_processing.do_sample_frames = True
+
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=4)[self.input_name]
+            encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=4)[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(
+                [video_inputs[0]], num_frames=4
+            )
+            expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(
+                video_inputs, num_frames=4
+            )
+            self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
+            self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
+
+            # Sample with `fps` requires metadata to infer number of frames from total duration
+            with self.assertRaises(ValueError):
+                encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3)[self.input_name]
+                encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", fps=3)[self.input_name]
+
+            metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
+            batched_metadata = metadata * len(video_inputs)
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata)[
+                self.input_name
+            ]
+            encoded_videos_batched = video_processing(
+                video_inputs, return_tensors="pt", fps=3, video_metadata=batched_metadata
+            )[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(
+                [video_inputs[0]], num_frames=6
+            )
+            expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(
+                video_inputs, num_frames=6
+            )
+            self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
+            self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
+
+            # We should raise error when asked to sample more frames than there are in input video
+            with self.assertRaises(ValueError):
+                encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=10)[self.input_name]
+                encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=10)[
+                    self.input_name
+                ]
+
+            # Assign back the actual num frames in tester
+            self.video_processor_tester.num_frames = prev_num_frames
--- a/tests/models/smolvlm/test_processor_smolvlm.py
+++ b/tests/models/smolvlm/test_processor_smolvlm.py
@@ -16,6 +16,7 @@ import shutil
 import tempfile
 import unittest
 from io import BytesIO
+from typing import Optional

 import numpy as np
 import requests
@@ -63,7 +64,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        )
        cls.bos_token = processor.tokenizer.bos_token
        cls.image_token = processor.image_token
-        cls.video_token = processor.image_token * 8  # SmolVLM uses image token and repeats it `num_frames` times
+        cls.video_token = processor.video_token
        cls.fake_image_token = processor.fake_image_token
        cls.global_img_token = processor.global_image_token

@@ -93,6 +94,13 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
        }

+    def prepare_video_inputs(self, batch_size: Optional[int] = None):
+        """This function prepares a list of numpy videos."""
+        video_input = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] * 8
+        if batch_size is None:
+            return [[video_input]]
+        return [[video_input]] * batch_size
+
    def get_split_image_expected_tokens(self, processor, image_rows, image_cols):
        text_split_images = []
        for n_h in range(image_rows):
@@ -347,7 +355,6 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
                    {"type": "text", "text": "What do these images show?"},
                    {"type": "image"},
                    {"type": "image"},
-                    "What do these images show?",
                ],
            },
            {
@@ -373,11 +380,8 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        )
        self.assertEqual(rendered, expected_rendered)

-    @unittest.skip(reason="SmolVLM replaced `type=video` with `type=image` in chat templates")
-    def test_apply_chat_template_video_special_processing(self):
-        pass
-
    @require_av
+    @require_torch
    def test_apply_chat_template_video_frame_sampling(self):
        # overridden because SmolVLM has special preprocessing for videos
        processor = self.get_processor()
@@ -406,7 +410,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenize=True,
            return_dict=True,
            num_frames=num_frames,
-            return_tensors="np",
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
@@ -421,7 +425,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenize=True,
            return_dict=True,
            video_fps=video_fps,
-            return_tensors="np",
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
@@ -482,11 +486,11 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            do_rescale=True,
            rescale_factor=-1,
            padding="max_length",
-            max_length=76,
+            max_length=172,
        )

        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
+        self.assertEqual(len(inputs["input_ids"][0]), 172)

    @require_torch
    @require_vision
--- a/tests/models/smolvlm/test_video_processing_smolvlm.py
+++ b/tests/models/smolvlm/test_video_processing_smolvlm.py
@@ -15,22 +15,16 @@

 import unittest

-import numpy as np
-
 from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
 from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+from transformers.utils import is_torchvision_available, is_vision_available

 from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs


-if is_torch_available():
-    import torch
-
 if is_vision_available():
    if is_torchvision_available():
        from transformers import SmolVLMVideoProcessor
-        from transformers.models.smolvlm.video_processing_smolvlm import get_resize_output_image_size


 class SmolVLMVideoProcessingTester:
@@ -58,6 +52,7 @@ class SmolVLMVideoProcessingTester:
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
+        self.max_image_size = size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
@@ -71,17 +66,16 @@ class SmolVLMVideoProcessingTester:
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "do_convert_rgb": self.do_convert_rgb,
+            "max_image_size": self.max_image_size,
        }

    def expected_output_video_shape(self, videos):
-        max_height, max_width = 0, 0
-        if not isinstance(videos[0], torch.Tensor):
-            videos = [torch.tensor(np.array(video)).permute(0, -1, -3, -2) for video in videos]
-        for video in videos:
-            height, width = get_resize_output_image_size(video, self.size["longest_edge"])
-            max_height = max(height, max_height)
-            max_width = max(width, max_width)
-        return [self.num_frames, self.num_channels, max_height, max_width]
+        return [
+            self.num_frames,
+            self.num_channels,
+            self.max_image_size["longest_edge"],
+            self.max_image_size["longest_edge"],
+        ]

    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
        videos = prepare_video_inputs(
@@ -116,3 +110,58 @@ class SmolVLMVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):

        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
        self.assertEqual(video_processor.size, {"height": 42, "width": 42})
+
+    # overwrite, SmolVLM requires to have metadata no matter how we sample
+    def test_call_sample_frames(self):
+        for video_processing_class in self.video_processor_list:
+            video_processing = video_processing_class(**self.video_processor_dict)
+
+            prev_num_frames = self.video_processor_tester.num_frames
+            self.video_processor_tester.num_frames = 8
+            video_inputs = self.video_processor_tester.prepare_video_inputs(
+                equal_resolution=False,
+                return_tensors="torch",
+            )
+
+            # Force set sampling to False. No sampling is expected even when `num_frames` exists
+            video_processing.do_sample_frames = False
+
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
+            encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
+            self.assertEqual(encoded_videos.shape[1], 8)
+            self.assertEqual(encoded_videos_batched.shape[1], 8)
+
+            # Set sampling to True. Video frames should be sampled with `num_frames` in the output
+            video_processing.do_sample_frames = True
+            metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
+            batched_metadata = metadata * len(video_inputs)
+
+            # Sample with `fps` requires metadata to infer number of frames from total duration
+            with self.assertRaises(ValueError):
+                encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=6, fps=3)[
+                    self.input_name
+                ]
+                encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=6, fps=3)[
+                    self.input_name
+                ]
+
+            encoded_videos = video_processing(
+                video_inputs[0], return_tensors="pt", num_frames=6, fps=3, video_metadata=metadata
+            )[self.input_name]
+            encoded_videos_batched = video_processing(
+                video_inputs, return_tensors="pt", num_frames=6, fps=3, video_metadata=batched_metadata
+            )[self.input_name]
+            self.assertEqual(encoded_videos.shape[1], 6)
+            self.assertEqual(encoded_videos_batched.shape[1], 6)
+
+            # We should raise error when asked to sample more frames than there are in input video
+            with self.assertRaises(ValueError):
+                encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=10, num_frames=20)[
+                    self.input_name
+                ]
+                encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", fps=10, num_frames=20)[
+                    self.input_name
+                ]
+
+            # Assign back the actual num frames in tester
+            self.video_processor_tester.num_frames = prev_num_frames
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -507,7 +507,7 @@ class ProcessorTesterMixin:
        if "video_processor" not in self.processor_class.attributes:
            self.skipTest(f"video_processor attribute not present in {self.processor_class}")
        processor_components = self.prepare_components()
-        processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
+        processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length")
        processor_kwargs = self.prepare_processor_dict()

        processor = self.processor_class(**processor_components, **processor_kwargs)
@@ -515,7 +515,7 @@ class ProcessorTesterMixin:
        input_str = self.prepare_text_inputs(modality="video")
        video_input = self.prepare_video_inputs()
        inputs = processor(text=input_str, videos=video_input, return_tensors="pt")
-        self.assertEqual(inputs[self.text_input_name].shape[-1], 117)
+        self.assertEqual(inputs[self.text_input_name].shape[-1], 167)

    def test_video_processor_defaults_preserved_by_video_kwargs(self):
        """
@@ -529,7 +529,7 @@ class ProcessorTesterMixin:
        processor_components["video_processor"] = self.get_component(
            "video_processor", do_rescale=True, rescale_factor=-1
        )
-        processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
+        processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length")
        processor_kwargs = self.prepare_processor_dict()

        processor = self.processor_class(**processor_components, **processor_kwargs)
@@ -553,9 +553,9 @@ class ProcessorTesterMixin:
        input_str = self.prepare_text_inputs(modality="video")
        video_input = self.prepare_video_inputs()
        inputs = processor(
-            text=input_str, videos=video_input, return_tensors="pt", max_length=112, padding="max_length"
+            text=input_str, videos=video_input, return_tensors="pt", max_length=162, padding="max_length"
        )
-        self.assertEqual(inputs[self.text_input_name].shape[-1], 112)
+        self.assertEqual(inputs[self.text_input_name].shape[-1], 162)

    def test_kwargs_overrides_default_video_processor_kwargs(self):
        if "video_processor" not in self.processor_class.attributes:
@@ -564,7 +564,7 @@ class ProcessorTesterMixin:
        processor_components["video_processor"] = self.get_component(
            "video_processor", do_rescale=True, rescale_factor=1
        )
-        processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
+        processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length")
        processor_kwargs = self.prepare_processor_dict()

        processor = self.processor_class(**processor_components, **processor_kwargs)
@@ -593,11 +593,11 @@ class ProcessorTesterMixin:
            do_rescale=True,
            rescale_factor=-1,
            padding="max_length",
-            max_length=76,
+            max_length=176,
        )

        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
-        self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
+        self.assertEqual(inputs[self.text_input_name].shape[-1], 176)

    def test_unstructured_kwargs_batched_video(self):
        if "video_processor" not in self.processor_class.attributes:
@@ -616,13 +616,13 @@ class ProcessorTesterMixin:
            do_rescale=True,
            rescale_factor=-1,
            padding="longest",
-            max_length=76,
+            max_length=176,
        )

        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
        self.assertTrue(
            len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1])
-            and len(inputs[self.text_input_name][1]) < 76
+            and len(inputs[self.text_input_name][1]) < 176
        )

    def test_doubly_passed_kwargs_video(self):
@@ -659,14 +659,14 @@ class ProcessorTesterMixin:
        all_kwargs = {
            "common_kwargs": {"return_tensors": "pt"},
            "videos_kwargs": {"do_rescale": True, "rescale_factor": -1},
-            "text_kwargs": {"padding": "max_length", "max_length": 76},
+            "text_kwargs": {"padding": "max_length", "max_length": 176},
        }

        inputs = processor(text=input_str, videos=video_input, **all_kwargs)
        self.skip_processor_without_typed_kwargs(processor)

        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
-        self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
+        self.assertEqual(inputs[self.text_input_name].shape[-1], 176)

    def test_structured_kwargs_nested_from_dict_video(self):
        if "video_processor" not in self.processor_class.attributes:
@@ -682,12 +682,12 @@ class ProcessorTesterMixin:
        all_kwargs = {
            "common_kwargs": {"return_tensors": "pt"},
            "videos_kwargs": {"do_rescale": True, "rescale_factor": -1},
-            "text_kwargs": {"padding": "max_length", "max_length": 76},
+            "text_kwargs": {"padding": "max_length", "max_length": 176},
        }

        inputs = processor(text=input_str, videos=video_input, **all_kwargs)
        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
-        self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
+        self.assertEqual(inputs[self.text_input_name].shape[-1], 176)

    # TODO: the same test, but for audio + text processors that have strong overlap in kwargs
    # TODO (molbap) use the same structure of attribute kwargs for other tests to avoid duplication
@@ -884,7 +884,7 @@ class ProcessorTesterMixin:
            tokenize=True,
            return_dict=True,
            return_tensors=return_tensors,
-            num_frames=4,  # by default no more than 4 frames, otherwise too slow
+            num_frames=2,  # by default no more than 2 frames, otherwise too slow
        )
        input_name = getattr(self, input_name)
        self.assertTrue(input_name in out_dict)
@@ -983,6 +983,21 @@ class ProcessorTesterMixin:
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), video_fps * 10)

+        # Whan `do_sample_frames=False` no sampling is done and whole video is loaded, even if number of frames is passed
+        video_fps = 1
+        out_dict_with_video = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            do_sample_frames=False,
+            video_fps=video_fps,
+            return_tensors="pt",
+        )
+        self.assertTrue(self.videos_input_name in out_dict_with_video)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 300)
+
        # Load with `video_fps` and `num_frames` args, should raise an error
        with self.assertRaises(ValueError):
            out_dict_with_video = processor.apply_chat_template(
@@ -1024,75 +1039,6 @@ class ProcessorTesterMixin:
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 2)

-    @require_av
-    @require_torch
-    def test_apply_chat_template_video_special_processing(self):
-        """
-        Tests that models can use their own preprocessing to preprocess conversations.
-        """
-        processor = self.get_processor()
-        if processor.chat_template is None:
-            self.skipTest("Processor has no chat template")
-
-        signature = inspect.signature(processor.__call__)
-        if "videos" not in {*signature.parameters.keys()} or (
-            signature.parameters.get("videos") is not None
-            and signature.parameters["videos"].annotation == inspect._empty
-        ):
-            self.skipTest("Processor doesn't accept videos at input")
-
-        video_file_path = hf_hub_download(
-            repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
-        )
-        messages = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "video", "path": video_file_path},
-                        {"type": "text", "text": "What is shown in this video?"},
-                    ],
-                },
-            ]
-        ]
-
-        def _process_messages_for_chat_template(
-            conversation,
-            batch_images,
-            batch_videos,
-            batch_video_metadata,
-            **chat_template_kwargs,
-        ):
-            # Let us just always return a dummy prompt
-            new_msg = [
-                [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "video"},  # no need to use path, video is loaded already by this moment
-                            {"type": "text", "text": "Dummy prompt for preprocess testing"},
-                        ],
-                    },
-                ]
-            ]
-            return new_msg
-
-        processor._process_messages_for_chat_template = _process_messages_for_chat_template
-        out_dict_with_video = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-        )
-        self.assertTrue(self.videos_input_name in out_dict_with_video)
-
-        # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
-        formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
-        self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 243)
-
    @require_librosa
    @require_av
    def test_chat_template_audio_from_video(self):
--- a/tests/test_video_processing_common.py
+++ b/tests/test_video_processing_common.py
@@ -293,6 +293,59 @@ class VideoProcessingTestMixin:
                (self.video_processor_tester.batch_size, *expected_output_video_shape),
            )

+    def test_call_sample_frames(self):
+        for video_processing_class in self.video_processor_list:
+            video_processing = video_processing_class(**self.video_processor_dict)
+
+            prev_num_frames = self.video_processor_tester.num_frames
+            self.video_processor_tester.num_frames = 8
+            video_inputs = self.video_processor_tester.prepare_video_inputs(
+                equal_resolution=False,
+                return_tensors="torch",
+            )
+
+            # Force set sampling to False. No sampling is expected even when `num_frames` exists
+            video_processing.do_sample_frames = False
+
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
+            encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
+            self.assertEqual(encoded_videos.shape[1], 8)
+            self.assertEqual(encoded_videos_batched.shape[1], 8)
+
+            # Set sampling to True. Video frames should be sampled with `num_frames` in the output
+            video_processing.do_sample_frames = True
+
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
+            encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
+            self.assertEqual(encoded_videos.shape[1], 3)
+            self.assertEqual(encoded_videos_batched.shape[1], 3)
+
+            # Sample with `fps` requires metadata to infer number of frames from total duration
+            with self.assertRaises(ValueError):
+                encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3)[self.input_name]
+                encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", fps=3)[self.input_name]
+
+            metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
+            batched_metadata = metadata * len(video_inputs)
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata)[
+                self.input_name
+            ]
+            encoded_videos_batched = video_processing(
+                video_inputs, return_tensors="pt", fps=3, video_metadata=batched_metadata
+            )[self.input_name]
+            self.assertEqual(encoded_videos.shape[1], 6)
+            self.assertEqual(encoded_videos_batched.shape[1], 6)
+
+            # We should raise error when asked to sample more frames than there are in input video
+            with self.assertRaises(ValueError):
+                encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=10)[self.input_name]
+                encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=10)[
+                    self.input_name
+                ]
+
+            # Assign back the actual num frames in tester
+            self.video_processor_tester.num_frames = prev_num_frames
+
    def test_nested_input(self):
        """Tests that the processor can work with nested list where each video is a list of arrays"""
        for video_processing_class in self.video_processor_list: