🔴 Video processors as a separate class (#35206)

* initial design * update all video processors * add tests * need to add qwen2-vl (not tested yet) * add qwen2-vl in auto map * fix copies * isort * resolve conflicts kinda * nit: * qwen2-vl is happy now * qwen2-5 happy * other models are happy * fix copies * fix tests * add docs * CI green now? * add more tests * even more changes + tests * doc builder fail * nit * Update src/transformers/models/auto/processing_auto.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * small update * imports correctly * dump, otherwise this is getting unmanageable T-T * dump * update * another update * update * tests * move * modular * docs * test * another update * init * remove flakiness in tests * fixup * clean up and remove commented lines * docs * skip this one! * last fix after rebasing * run fixup * delete slow files * remove unnecessary tests + clean up a bit * small fixes * fix tests * more updates * docs * fix tests * update * style * fix qwen2-5-vl * fixup * fixup * unflatten batch when preparing * dump, come back soon * add docs and fix some tests * how to guard this with new dummies? * chat templates in qwen * address some comments * remove `Fast` suffix * fixup * oops should be imported from transforms * typo in requires dummies * new model added with video support * fixup once more * last fixup I hope * revert image processor name + comments * oh, this is why fetch test is failing * fix tests * fix more tests * fixup * add new models: internvl, smolvlm * update docs * import once * fix failing tests * do we need to guard it here again, why? * new model was added, update it * remove testcase from tester * fix tests * make style * not related CI fail, let's just fix here * mark flaky for now, fails 15 out of 100 * style * maybe we can do this way? * don't download images in setup class --------- Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
2025-05-12 11:55:51 +02:00
parent 716819b830
commit a31fa218ad
83 changed files with 5418 additions and 2004 deletions
--- a/tests/models/internvl/test_processor_internvl.py
+++ b/tests/models/internvl/test_processor_internvl.py
@@ -18,12 +18,13 @@ import tempfile
 import unittest

 from huggingface_hub import hf_hub_download
+from parameterized import parameterized

 from transformers import AutoProcessor, AutoTokenizer, InternVLProcessor
 from transformers.testing_utils import require_av, require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available

-from ...test_processing_common import ProcessorTesterMixin
+from ...test_processing_common import MODALITY_INPUT_DATA, ProcessorTesterMixin


 if is_torch_available():
@@ -31,7 +32,7 @@ if is_torch_available():


 if is_vision_available():
-    from transformers import GotOcr2ImageProcessor
+    from transformers import GotOcr2ImageProcessor, InternVLVideoProcessor


@require_vision
@@ -55,12 +56,22 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            image_std=[0.229, 0.224, 0.225],
            do_convert_rgb=True,
        )
+        video_processor = InternVLVideoProcessor(
+            do_resize=True,
+            size={"height": 20, "width": 20},
+            do_rescale=True,
+            rescale_factor=1 / 255,
+            do_normalize=True,
+            image_mean=[0.485, 0.456, 0.406],
+            image_std=[0.229, 0.224, 0.225],
+            do_convert_rgb=True,
+        )
        tokenizer = AutoTokenizer.from_pretrained("OpenGVLab/InternVL3-1B-hf", padding_side="left")
        processor_kwargs = cls.prepare_processor_dict()
-        processor = InternVLProcessor.from_pretrained(
-            "OpenGVLab/InternVL3-1B-hf",
+        processor = InternVLProcessor(
            image_processor=image_processor,
            tokenizer=tokenizer,
+            video_processor=video_processor,
            **processor_kwargs,
        )
        processor.save_pretrained(cls.tmpdirname)
@@ -69,7 +80,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):

    @staticmethod
    def prepare_processor_dict():
-        return {"image_seq_length": 10}
+        return {"image_seq_length": 2}

    def get_tokenizer(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -77,6 +88,9 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

+    def get_video_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
+
    def get_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)

@@ -168,6 +182,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):

    # Override video chat_template tests as InternVLProcessor returns flattened video features
    @require_av
+    @require_torch
    def test_apply_chat_template_video_special_processing(self):
        """
        Tests that models can use their own preprocessing to preprocess conversations.
@@ -225,7 +240,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
-            return_tensors="np",
+            return_tensors="pt",
            num_frames=8,
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
@@ -236,6 +251,8 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        # Difference with common tests, InternVLProcessor returns flattened video features, and uses 8 frames by default
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)

+    @require_torch
+    @require_av
    def test_apply_chat_template_video_frame_sampling(self):
        processor = self.get_processor()

@@ -271,7 +288,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenize=True,
            return_dict=True,
            num_frames=num_frames,
-            return_tensors="np",
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), num_frames)
@@ -284,6 +301,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 300)
@@ -302,6 +320,97 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2)
+
+    @require_av
+    @parameterized.expand([(1, "pt"), (2, "pt")])
+    def test_apply_chat_template_video(self, batch_size: int, return_tensors: str):
+        processor = self.get_processor()
+        if processor.chat_template is None:
+            self.skipTest("Processor has no chat template")
+
+        if "video_processor" not in self.processor_class.attributes:
+            self.skipTest(f"`video_processor` attribute not present in {self.processor_class}")
+
+        batch_messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "text": "Describe this."}],
+                },
+            ]
+        ] * batch_size
+
+        # Test that jinja can be applied
+        formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
+        self.assertEqual(len(formatted_prompt), batch_size)
+
+        # Test that tokenizing with template and directly with `self.tokenizer` gives same output
+        formatted_prompt_tokenized = processor.apply_chat_template(
+            batch_messages, add_generation_prompt=True, tokenize=True, return_tensors="pt"
+        )
+        add_special_tokens = True
+        if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
+            add_special_tokens = False
+        tok_output = processor.tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=add_special_tokens)
+        expected_output = tok_output.input_ids
+        self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())
+
+        # Test that kwargs passed to processor's `__call__` are actually used
+        tokenized_prompt_100 = processor.apply_chat_template(
+            batch_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+            max_length=100,
+        )
+        self.assertEqual(len(tokenized_prompt_100[0]), 100)
+
+        # Test that `return_dict=True` returns text related inputs in the dict
+        out_dict_text = processor.apply_chat_template(
+            batch_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"]))
+        self.assertEqual(len(out_dict_text["input_ids"]), batch_size)
+        self.assertEqual(len(out_dict_text["attention_mask"]), batch_size)
+
+        # Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
+        for idx, url in enumerate(MODALITY_INPUT_DATA["videos"][:batch_size]):
+            batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": "video", "url": url}]
+
+        out_dict = processor.apply_chat_template(
+            batch_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+            num_frames=4,  # by default no more than 4 frames, otherwise too slow
+        )
+        self.assertTrue(self.videos_input_name in out_dict)
+        self.assertEqual(len(out_dict["input_ids"]), batch_size)
+        self.assertEqual(len(out_dict["attention_mask"]), batch_size)
+
+        video_len = 4 if batch_size == 1 else 3  # InternVL patches out and removes frames after processing
+        self.assertEqual(len(out_dict[self.videos_input_name]), video_len)
+        for k in out_dict:
+            self.assertIsInstance(out_dict[k], torch.Tensor)
+
+        # Test continue from final message
+        assistant_message = {
+            "role": "assistant",
+            "content": [{"type": "text", "text": "It is the sound of"}],
+        }
+        for batch_idx in range(batch_size):
+            batch_messages[batch_idx] = batch_messages[batch_idx] + [assistant_message]
+        continue_prompt = processor.apply_chat_template(batch_messages, continue_final_message=True, tokenize=False)
+        for prompt in continue_prompt:
+            self.assertTrue(prompt.endswith("It is the sound of"))  # no `eos` token at the end
--- a/tests/models/internvl/test_video_processor_internvl.py
+++ b/tests/models/internvl/test_video_processor_internvl.py
@@ -0,0 +1,107 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+
+from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
+
+
+if is_torch_available():
+    pass
+
+if is_vision_available():
+    if is_torchvision_available():
+        from transformers import InternVLVideoProcessor
+
+
+class InternVLVideoProcessingTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=5,
+        num_frames=8,
+        num_channels=3,
+        min_resolution=30,
+        max_resolution=80,
+        do_resize=True,
+        size=None,
+        do_normalize=True,
+        image_mean=OPENAI_CLIP_MEAN,
+        image_std=OPENAI_CLIP_STD,
+        do_convert_rgb=True,
+    ):
+        size = size if size is not None else {"height": 384, "width": 384}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+
+    def prepare_video_processor_dict(self):
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_convert_rgb": self.do_convert_rgb,
+        }
+
+    def expected_output_video_shape(self, videos):
+        return [self.num_frames, self.num_channels, self.size["height"], self.size["width"]]
+
+    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
+        videos = prepare_video_inputs(
+            batch_size=self.batch_size,
+            num_frames=self.num_frames,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            return_tensors=return_tensors,
+        )
+        return videos
+
+
+@require_torch
+@require_vision
+class InternVLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
+    fast_video_processing_class = InternVLVideoProcessor if is_torchvision_available() else None
+
+    def setUp(self):
+        super().setUp()
+        self.video_processor_tester = InternVLVideoProcessingTester(self)
+
+    @property
+    def video_processor_dict(self):
+        return self.video_processor_tester.prepare_video_processor_dict()
+
+    def test_video_processor_from_dict_with_kwargs(self):
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
+        self.assertEqual(video_processor.size, {"height": 384, "width": 384})
+
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
+        self.assertEqual(video_processor.size, {"height": 42, "width": 42})