🔴 Video processors as a separate class (#35206)

* initial design * update all video processors * add tests * need to add qwen2-vl (not tested yet) * add qwen2-vl in auto map * fix copies * isort * resolve conflicts kinda * nit: * qwen2-vl is happy now * qwen2-5 happy * other models are happy * fix copies * fix tests * add docs * CI green now? * add more tests * even more changes + tests * doc builder fail * nit * Update src/transformers/models/auto/processing_auto.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * small update * imports correctly * dump, otherwise this is getting unmanageable T-T * dump * update * another update * update * tests * move * modular * docs * test * another update * init * remove flakiness in tests * fixup * clean up and remove commented lines * docs * skip this one! * last fix after rebasing * run fixup * delete slow files * remove unnecessary tests + clean up a bit * small fixes * fix tests * more updates * docs * fix tests * update * style * fix qwen2-5-vl * fixup * fixup * unflatten batch when preparing * dump, come back soon * add docs and fix some tests * how to guard this with new dummies? * chat templates in qwen * address some comments * remove `Fast` suffix * fixup * oops should be imported from transforms * typo in requires dummies * new model added with video support * fixup once more * last fixup I hope * revert image processor name + comments * oh, this is why fetch test is failing * fix tests * fix more tests * fixup * add new models: internvl, smolvlm * update docs * import once * fix failing tests * do we need to guard it here again, why? * new model was added, update it * remove testcase from tester * fix tests * make style * not related CI fail, let's just fix here * mark flaky for now, fails 15 out of 100 * style * maybe we can do this way? * don't download images in setup class --------- Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
2025-05-12 11:55:51 +02:00
parent 716819b830
commit a31fa218ad
83 changed files with 5418 additions and 2004 deletions
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -539,7 +539,7 @@ class ProcessorTesterMixin:
        video_input = self.prepare_video_inputs()

        inputs = processor(text=input_str, videos=video_input, return_tensors="pt")
-        self.assertLessEqual(inputs[self.videos_input_name][0][0][0].mean(), 0)
+        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)

    def test_kwargs_overrides_default_tokenizer_kwargs_video(self):
        if "video_processor" not in self.processor_class.attributes:
@@ -574,7 +574,7 @@ class ProcessorTesterMixin:
        video_input = self.prepare_video_inputs()

        inputs = processor(text=input_str, videos=video_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
-        self.assertLessEqual(inputs[self.videos_input_name][0][0][0].mean(), 0)
+        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)

    def test_unstructured_kwargs_video(self):
        if "video_processor" not in self.processor_class.attributes:
@@ -596,7 +596,7 @@ class ProcessorTesterMixin:
            max_length=76,
        )

-        self.assertLessEqual(inputs[self.videos_input_name][0][0][0].mean(), 0)
+        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
        self.assertEqual(inputs[self.text_input_name].shape[-1], 76)

    def test_unstructured_kwargs_batched_video(self):
@@ -619,7 +619,7 @@ class ProcessorTesterMixin:
            max_length=76,
        )

-        self.assertLessEqual(inputs[self.videos_input_name][0][0][0].mean(), 0)
+        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
        self.assertTrue(
            len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1])
            and len(inputs[self.text_input_name][1]) < 76
@@ -665,7 +665,7 @@ class ProcessorTesterMixin:
        inputs = processor(text=input_str, videos=video_input, **all_kwargs)
        self.skip_processor_without_typed_kwargs(processor)

-        self.assertLessEqual(inputs[self.videos_input_name][0][0][0].mean(), 0)
+        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
        self.assertEqual(inputs[self.text_input_name].shape[-1], 76)

    def test_structured_kwargs_nested_from_dict_video(self):
@@ -686,7 +686,7 @@ class ProcessorTesterMixin:
        }

        inputs = processor(text=input_str, videos=video_input, **all_kwargs)
-        self.assertLessEqual(inputs[self.videos_input_name][0][0][0].mean(), 0)
+        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
        self.assertEqual(inputs[self.text_input_name].shape[-1], 76)

    # TODO: the same test, but for audio + text processors that have strong overlap in kwargs
@@ -907,15 +907,15 @@ class ProcessorTesterMixin:
        for prompt in continue_prompt:
            self.assertTrue(prompt.endswith("It is the sound of"))  # no `eos` token at the end

-    @require_av
+    @require_librosa
    @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
    def test_apply_chat_template_audio(self, batch_size: int, return_tensors: str):
        self._test_apply_chat_template(
            "audio", batch_size, return_tensors, "audio_input_name", "feature_extracttor", MODALITY_INPUT_DATA["audio"]
        )

-    @require_librosa
-    @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
+    @require_av
+    @parameterized.expand([(1, "pt"), (2, "pt")])  # video processor supports only torchvision
    def test_apply_chat_template_video(self, batch_size: int, return_tensors: str):
        self._test_apply_chat_template(
            "video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"]
@@ -927,6 +927,7 @@ class ProcessorTesterMixin:
            "image", batch_size, return_tensors, "images_input_name", "image_processor", MODALITY_INPUT_DATA["images"]
        )

+    @require_torch
    def test_apply_chat_template_video_frame_sampling(self):
        processor = self.get_processor()

@@ -962,7 +963,7 @@ class ProcessorTesterMixin:
            tokenize=True,
            return_dict=True,
            num_frames=num_frames,
-            return_tensors="np",
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
@@ -976,7 +977,7 @@ class ProcessorTesterMixin:
            tokenize=True,
            return_dict=True,
            video_fps=video_fps,
-            return_tensors="np",
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
@@ -1024,6 +1025,7 @@ class ProcessorTesterMixin:
        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 2)

    @require_av
+    @require_torch
    def test_apply_chat_template_video_special_processing(self):
        """
        Tests that models can use their own preprocessing to preprocess conversations.
@@ -1081,7 +1083,7 @@ class ProcessorTesterMixin:
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
-            return_tensors="np",
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)