🔴 Video processors as a separate class (#35206)

* initial design * update all video processors * add tests * need to add qwen2-vl (not tested yet) * add qwen2-vl in auto map * fix copies * isort * resolve confilicts kinda * nit: * qwen2-vl is happy now * qwen2-5 happy * other models are happy * fix copies * fix tests * add docs * CI green now? * add more tests * even more changes + tests * doc builder fail * nit * Update src/transformers/models/auto/processing_auto.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * small update * imports correctly * dump, otherwise this is getting unmanagebale T-T * dump * update * another update * update * tests * move * modular * docs * test * another update * init * remove flakiness in tests * fixup * clean up and remove commented lines * docs * skip this one! * last fix after rebasing * run fixup * delete slow files * remove unnecessary tests + clean up a bit * small fixes * fix tests * more updates * docs * fix tests * update * style * fix qwen2-5-vl * fixup * fixup * unflatten batch when preparing * dump, come back soon * add docs and fix some tests * how to guard this with new dummies? * chat templates in qwen * address some comments * remove `Fast` suffix * fixup * oops should be imported from transforms * typo in requires dummies * new model added with video support * fixup once more * last fixup I hope * revert image processor name + comments * oh, this is why fetch test is failing * fix tests * fix more tests * fixup * add new models: internvl, smolvlm * update docs * imprt once * fix failing tests * do we need to guard it here again, why? * new model was added, update it * remove testcase from tester * fix tests * make style * not related CI fail, lets' just fix here * mark flaky for now, filas 15 out of 100 * style * maybe we can do this way? * don't download images in setup class --------- Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
2025-05-12 11:55:51 +02:00
parent 716819b830
commit a31fa218ad
83 changed files with 5418 additions and 2004 deletions
--- a/tests/models/auto/test_image_processing_auto.py
+++ b/tests/models/auto/test_image_processing_auto.py
@@ -73,6 +73,19 @@ class AutoImageProcessorTest(unittest.TestCase):
            config = AutoImageProcessor.from_pretrained(tmpdirname)
            self.assertIsInstance(config, CLIPImageProcessor)

+    def test_image_processor_from_new_filename(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
+            config_tmpfile = Path(tmpdirname) / "config.json"
+            json.dump(
+                {"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
+                open(processor_tmpfile, "w"),
+            )
+            json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
+
+            config = AutoImageProcessor.from_pretrained(tmpdirname)
+            self.assertIsInstance(config, CLIPImageProcessor)
+
    def test_image_processor_from_local_directory_from_config(self):
        with tempfile.TemporaryDirectory() as tmpdirname:
            model_config = CLIPConfig()
--- a/tests/models/auto/test_processor_auto.py
+++ b/tests/models/auto/test_processor_auto.py
@@ -40,7 +40,11 @@ from transformers import (
 )
 from transformers.testing_utils import TOKEN, TemporaryHubRepo, get_tests_dir, is_staging_test
 from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
-from transformers.utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, is_tokenizers_available
+from transformers.utils import (
+    FEATURE_EXTRACTOR_NAME,
+    PROCESSOR_NAME,
+    is_tokenizers_available,
+)


 sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
@@ -395,6 +399,13 @@ class AutoFeatureExtractorTest(unittest.TestCase):
        processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-convnext")
        self.assertEqual(processor.__class__.__name__, "ConvNextImageProcessor")

+    def test_auto_processor_save_load(self):
+        processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            processor.save_pretrained(tmp_dir)
+            second_processor = AutoProcessor.from_pretrained(tmp_dir)
+            self.assertEqual(second_processor.__class__.__name__, processor.__class__.__name__)
+

@is_staging_test
 class ProcessorPushToHubTester(unittest.TestCase):
--- a/tests/models/auto/test_video_processing_auto.py
+++ b/tests/models/auto/test_video_processing_auto.py
@@ -0,0 +1,252 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+
+import transformers
+from transformers import (
+    CONFIG_MAPPING,
+    VIDEO_PROCESSOR_MAPPING,
+    AutoConfig,
+    AutoVideoProcessor,
+    LlavaOnevisionConfig,
+    LlavaOnevisionVideoProcessor,
+)
+from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_torch
+
+
+sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
+
+from test_module.custom_configuration import CustomConfig  # noqa E402
+from test_module.custom_video_processing import CustomVideoProcessor  # noqa E402
+
+
+@require_torch
+class AutoVideoProcessorTest(unittest.TestCase):
+    def setUp(self):
+        transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
+
+    def test_video_processor_from_model_shortcut(self):
+        config = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
+        self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
+
+    def test_video_processor_from_local_directory_from_key(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
+            config_tmpfile = Path(tmpdirname) / "config.json"
+            json.dump(
+                {
+                    "video_processor_type": "LlavaOnevisionVideoProcessor",
+                    "processor_class": "LlavaOnevisionProcessor",
+                },
+                open(processor_tmpfile, "w"),
+            )
+            json.dump({"model_type": "llava_onevision"}, open(config_tmpfile, "w"))
+
+            config = AutoVideoProcessor.from_pretrained(tmpdirname)
+            self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
+
+    def test_video_processor_from_local_directory_from_preprocessor_key(self):
+        # Ensure we can load the image processor from the feature extractor config
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
+            config_tmpfile = Path(tmpdirname) / "config.json"
+            json.dump(
+                {
+                    "video_processor_type": "LlavaOnevisionVideoProcessor",
+                    "processor_class": "LlavaOnevisionProcessor",
+                },
+                open(processor_tmpfile, "w"),
+            )
+            json.dump({"model_type": "llava_onevision"}, open(config_tmpfile, "w"))
+
+            config = AutoVideoProcessor.from_pretrained(tmpdirname)
+            self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
+
+    def test_video_processor_from_local_directory_from_config(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model_config = LlavaOnevisionConfig()
+
+            # Create a dummy config file with image_proceesor_type
+            processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
+            config_tmpfile = Path(tmpdirname) / "config.json"
+            json.dump(
+                {
+                    "video_processor_type": "LlavaOnevisionVideoProcessor",
+                    "processor_class": "LlavaOnevisionProcessor",
+                },
+                open(processor_tmpfile, "w"),
+            )
+            json.dump({"model_type": "llava_onevision"}, open(config_tmpfile, "w"))
+
+            # remove video_processor_type to make sure config.json alone is enough to load image processor locally
+            config_dict = AutoVideoProcessor.from_pretrained(tmpdirname).to_dict()
+
+            config_dict.pop("video_processor_type")
+            config = LlavaOnevisionVideoProcessor(**config_dict)
+
+            # save in new folder
+            model_config.save_pretrained(tmpdirname)
+            config.save_pretrained(tmpdirname)
+
+            config = AutoVideoProcessor.from_pretrained(tmpdirname)
+
+            # make sure private variable is not incorrectly saved
+            dict_as_saved = json.loads(config.to_json_string())
+            self.assertTrue("_processor_class" not in dict_as_saved)
+
+        self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
+
+    def test_video_processor_from_local_file(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
+            json.dump(
+                {
+                    "video_processor_type": "LlavaOnevisionVideoProcessor",
+                    "processor_class": "LlavaOnevisionProcessor",
+                },
+                open(processor_tmpfile, "w"),
+            )
+
+            config = AutoVideoProcessor.from_pretrained(processor_tmpfile)
+            self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
+
+    def test_repo_not_found(self):
+        with self.assertRaisesRegex(
+            EnvironmentError,
+            "llava-hf/llava-doesnt-exist is not a local folder and is not a valid model identifier",
+        ):
+            _ = AutoVideoProcessor.from_pretrained("llava-hf/llava-doesnt-exist")
+
+    def test_revision_not_found(self):
+        with self.assertRaisesRegex(
+            EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
+        ):
+            _ = AutoVideoProcessor.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
+
+    def test_video_processor_not_found(self):
+        with self.assertRaisesRegex(
+            EnvironmentError,
+            "hf-internal-testing/config-no-model does not appear to have a file named preprocessor_config.json.",
+        ):
+            _ = AutoVideoProcessor.from_pretrained("hf-internal-testing/config-no-model")
+
+    def test_from_pretrained_dynamic_video_processor(self):
+        # If remote code is not set, we will time out when asking whether to load the model.
+        with self.assertRaises(ValueError):
+            video_processor = AutoVideoProcessor.from_pretrained("hf-internal-testing/test_dynamic_video_processor")
+        # If remote code is disabled, we can't load this config.
+        with self.assertRaises(ValueError):
+            video_processor = AutoVideoProcessor.from_pretrained(
+                "hf-internal-testing/test_dynamic_video_processor", trust_remote_code=False
+            )
+
+        video_processor = AutoVideoProcessor.from_pretrained(
+            "hf-internal-testing/test_dynamic_video_processor", trust_remote_code=True
+        )
+        self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
+
+        # Test the dynamic module is loaded only once.
+        reloaded_video_processor = AutoVideoProcessor.from_pretrained(
+            "hf-internal-testing/test_dynamic_video_processor", trust_remote_code=True
+        )
+        self.assertIs(video_processor.__class__, reloaded_video_processor.__class__)
+
+        # Test image processor can be reloaded.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            video_processor.save_pretrained(tmp_dir)
+            reloaded_video_processor = AutoVideoProcessor.from_pretrained(tmp_dir, trust_remote_code=True)
+        self.assertEqual(reloaded_video_processor.__class__.__name__, "NewVideoProcessor")
+
+        # The image processor file is cached in the snapshot directory. So the module file is not changed after dumping
+        # to a temp dir. Because the revision of the module file is not changed.
+        # Test the dynamic module is loaded only once if the module file is not changed.
+        self.assertIs(video_processor.__class__, reloaded_video_processor.__class__)
+
+        # Test the dynamic module is reloaded if we force it.
+        reloaded_video_processor = AutoVideoProcessor.from_pretrained(
+            "hf-internal-testing/test_dynamic_video_processor", trust_remote_code=True, force_download=True
+        )
+        self.assertIsNot(video_processor.__class__, reloaded_video_processor.__class__)
+
+    def test_new_video_processor_registration(self):
+        try:
+            AutoConfig.register("custom", CustomConfig)
+            AutoVideoProcessor.register(CustomConfig, CustomVideoProcessor)
+            # Trying to register something existing in the Transformers library will raise an error
+            with self.assertRaises(ValueError):
+                AutoVideoProcessor.register(LlavaOnevisionConfig, LlavaOnevisionVideoProcessor)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
+                config_tmpfile = Path(tmpdirname) / "config.json"
+                json.dump(
+                    {
+                        "video_processor_type": "LlavaOnevisionVideoProcessor",
+                        "processor_class": "LlavaOnevisionProcessor",
+                    },
+                    open(processor_tmpfile, "w"),
+                )
+                json.dump({"model_type": "llava_onevision"}, open(config_tmpfile, "w"))
+
+                video_processor = CustomVideoProcessor.from_pretrained(tmpdirname)
+
+            # Now that the config is registered, it can be used as any other config with the auto-API
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                video_processor.save_pretrained(tmp_dir)
+                new_video_processor = AutoVideoProcessor.from_pretrained(tmp_dir)
+                self.assertIsInstance(new_video_processor, CustomVideoProcessor)
+
+        finally:
+            if "custom" in CONFIG_MAPPING._extra_content:
+                del CONFIG_MAPPING._extra_content["custom"]
+            if CustomConfig in VIDEO_PROCESSOR_MAPPING._extra_content:
+                del VIDEO_PROCESSOR_MAPPING._extra_content[CustomConfig]
+
+    def test_from_pretrained_dynamic_video_processor_conflict(self):
+        class NewVideoProcessor(LlavaOnevisionVideoProcessor):
+            is_local = True
+
+        try:
+            AutoConfig.register("custom", CustomConfig)
+            AutoVideoProcessor.register(CustomConfig, NewVideoProcessor)
+            # If remote code is not set, the default is to use local
+            video_processor = AutoVideoProcessor.from_pretrained("hf-internal-testing/test_dynamic_video_processor")
+            self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
+            self.assertTrue(video_processor.is_local)
+
+            # If remote code is disabled, we load the local one.
+            video_processor = AutoVideoProcessor.from_pretrained(
+                "hf-internal-testing/test_dynamic_video_processor", trust_remote_code=False
+            )
+            self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
+            self.assertTrue(video_processor.is_local)
+
+            # If remote is enabled, we load from the Hub
+            video_processor = AutoVideoProcessor.from_pretrained(
+                "hf-internal-testing/test_dynamic_video_processor", trust_remote_code=True
+            )
+            self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
+            self.assertTrue(not hasattr(video_processor, "is_local"))
+
+        finally:
+            if "custom" in CONFIG_MAPPING._extra_content:
+                del CONFIG_MAPPING._extra_content["custom"]
+            if CustomConfig in VIDEO_PROCESSOR_MAPPING._extra_content:
+                del VIDEO_PROCESSOR_MAPPING._extra_content[CustomConfig]
--- a/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py
+++ b/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py
@@ -1,190 +0,0 @@
-# Copyright 2024 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
-from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_vision_available
-
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
-
-
-if is_torch_available():
-    import torch
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import InstructBlipVideoImageProcessor
-
-
-class InstructBlipVideoProcessingTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=5,
-        num_channels=3,
-        image_size=24,
-        min_resolution=30,
-        max_resolution=80,
-        do_resize=True,
-        size=None,
-        do_normalize=True,
-        image_mean=OPENAI_CLIP_MEAN,
-        image_std=OPENAI_CLIP_STD,
-        do_convert_rgb=True,
-        frames=4,
-    ):
-        size = size if size is not None else {"height": 18, "width": 18}
-        self.parent = parent
-        self.batch_size = batch_size
-        self.num_channels = num_channels
-        self.image_size = image_size
-        self.min_resolution = min_resolution
-        self.max_resolution = max_resolution
-        self.do_resize = do_resize
-        self.size = size
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.do_convert_rgb = do_convert_rgb
-        self.frames = frames
-
-    def prepare_image_processor_dict(self):
-        return {
-            "do_resize": self.do_resize,
-            "size": self.size,
-            "do_normalize": self.do_normalize,
-            "image_mean": self.image_mean,
-            "image_std": self.image_std,
-            "do_convert_rgb": self.do_convert_rgb,
-        }
-
-    def expected_output_image_shape(self, images):
-        return self.frames, self.num_channels, self.size["height"], self.size["width"]
-
-    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
-        images = prepare_image_inputs(
-            batch_size=self.batch_size,
-            num_channels=self.num_channels,
-            min_resolution=self.min_resolution,
-            max_resolution=self.max_resolution,
-            equal_resolution=equal_resolution,
-            numpify=numpify,
-            torchify=torchify,
-        )
-
-        # let's simply copy the frames to fake a long video-clip
-        if numpify or torchify:
-            videos = []
-            for image in images:
-                if numpify:
-                    video = image[None, ...].repeat(self.frames, 0)
-                else:
-                    video = image[None, ...].repeat(self.frames, 1, 1, 1)
-                videos.append(video)
-        else:
-            videos = []
-            for pil_image in images:
-                videos.append([pil_image] * self.frames)
-
-        return videos
-
-
-@require_torch
-@require_vision
-class InstructBlipVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
-    image_processing_class = InstructBlipVideoImageProcessor if is_vision_available() else None
-
-    def setUp(self):
-        super().setUp()
-        self.image_processor_tester = InstructBlipVideoProcessingTester(self)
-
-    @property
-    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
-    def image_processor_dict(self):
-        return self.image_processor_tester.prepare_image_processor_dict()
-
-    def test_image_processor_properties(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        self.assertTrue(hasattr(image_processing, "do_resize"))
-        self.assertTrue(hasattr(image_processing, "size"))
-        self.assertTrue(hasattr(image_processing, "do_normalize"))
-        self.assertTrue(hasattr(image_processing, "image_mean"))
-        self.assertTrue(hasattr(image_processing, "image_std"))
-        self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
-
-    def test_image_processor_from_dict_with_kwargs(self):
-        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
-        self.assertEqual(image_processor.size, {"height": 18, "width": 18})
-
-        image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
-        self.assertEqual(image_processor.size, {"height": 42, "width": 42})
-
-    def test_call_pil(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
-        video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
-        for video in video_inputs:
-            self.assertIsInstance(video[0], Image.Image)
-
-        # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!)
-        encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values
-        expected_output_video_shape = (1, 4, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-        # Test batched
-        encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values
-        expected_output_video_shape = (5, 4, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-    def test_call_numpy(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
-        video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
-        for video in video_inputs:
-            self.assertIsInstance(video, np.ndarray)
-
-        # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!)
-        encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values
-        expected_output_video_shape = (1, 4, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-        # Test batched
-        encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values
-        expected_output_video_shape = (5, 4, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-    def test_call_pytorch(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random PyTorch tensors
-        video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
-        for video in video_inputs:
-            self.assertIsInstance(video, torch.Tensor)
-
-        # Test not batched input
-        encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values
-        expected_output_video_shape = (1, 4, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-        # Test batched
-        encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values
-        expected_output_video_shape = (5, 4, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
--- a/tests/models/instructblipvideo/test_processor_instructblipvideo.py
+++ b/tests/models/instructblipvideo/test_processor_instructblipvideo.py
@@ -17,8 +17,8 @@ import unittest

 import pytest

-from transformers.testing_utils import require_vision
-from transformers.utils import is_vision_available
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torchvision_available, is_vision_available

 from ...test_processing_common import ProcessorTesterMixin

@@ -28,14 +28,16 @@ if is_vision_available():
        AutoProcessor,
        BertTokenizerFast,
        GPT2Tokenizer,
-        InstructBlipVideoImageProcessor,
        InstructBlipVideoProcessor,
        PreTrainedTokenizerFast,
    )

+    if is_torchvision_available():
+        from transformers import InstructBlipVideoVideoProcessor
+

@require_vision
-# Copied from tests.models.instructblip.test_processor_instructblip.InstructBlipProcessorTest with InstructBlip->InstructBlipVideo, BlipImageProcessor->InstructBlipVideoImageProcessor
+@require_torch
 class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = InstructBlipVideoProcessor

@@ -43,23 +45,23 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def setUpClass(cls):
        cls.tmpdirname = tempfile.mkdtemp()

-        image_processor = InstructBlipVideoImageProcessor()
+        video_processor = InstructBlipVideoVideoProcessor()
        tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model")
        qformer_tokenizer = BertTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-bert")

-        processor = InstructBlipVideoProcessor(image_processor, tokenizer, qformer_tokenizer)
+        processor = InstructBlipVideoProcessor(video_processor, tokenizer, qformer_tokenizer)

        processor.save_pretrained(cls.tmpdirname)

    def get_tokenizer(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer

-    def get_image_processor(self, **kwargs):
-        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
-
    def get_qformer_tokenizer(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).qformer_tokenizer

+    def get_video_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
+
    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdirname, ignore_errors=True)
@@ -67,14 +69,14 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def test_save_load_pretrained_additional_features(self):
        processor = InstructBlipVideoProcessor(
            tokenizer=self.get_tokenizer(),
-            image_processor=self.get_image_processor(),
+            video_processor=self.get_video_processor(),
            qformer_tokenizer=self.get_qformer_tokenizer(),
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            processor.save_pretrained(tmpdir)

            tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
-            image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
+            video_processor_add_kwargs = self.get_video_processor(do_normalize=False, padding_value=1.0)

            processor = InstructBlipVideoProcessor.from_pretrained(
                tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
@@ -83,34 +85,34 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast)

-        self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
-        self.assertIsInstance(processor.image_processor, InstructBlipVideoImageProcessor)
+        self.assertEqual(processor.video_processor.to_json_string(), video_processor_add_kwargs.to_json_string())
+        self.assertIsInstance(processor.video_processor, InstructBlipVideoVideoProcessor)
        self.assertIsInstance(processor.qformer_tokenizer, BertTokenizerFast)

-    def test_image_processor(self):
-        image_processor = self.get_image_processor()
+    def test_video_processor(self):
+        video_processor = self.get_video_processor()
        tokenizer = self.get_tokenizer()
        qformer_tokenizer = self.get_qformer_tokenizer()

        processor = InstructBlipVideoProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
        )

        image_input = self.prepare_image_inputs()

-        input_feat_extract = image_processor(image_input, return_tensors="np")
-        input_processor = processor(images=image_input, return_tensors="np")
+        input_feat_extract = video_processor(image_input, return_tensors="pt")
+        input_processor = processor(images=image_input, return_tensors="pt")

        for key in input_feat_extract.keys():
            self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)

    def test_tokenizer(self):
-        image_processor = self.get_image_processor()
+        video_processor = self.get_video_processor()
        tokenizer = self.get_tokenizer()
        qformer_tokenizer = self.get_qformer_tokenizer()

        processor = InstructBlipVideoProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
        )

        input_str = ["lower newer"]
@@ -127,12 +129,12 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            self.assertListEqual(encoded_tokens_qformer[key], encoded_processor["qformer_" + key])

    def test_processor(self):
-        image_processor = self.get_image_processor()
+        video_processor = self.get_video_processor()
        tokenizer = self.get_tokenizer()
        qformer_tokenizer = self.get_qformer_tokenizer()

        processor = InstructBlipVideoProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
        )

        input_str = "lower newer"
@@ -150,12 +152,12 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            processor()

    def test_tokenizer_decode(self):
-        image_processor = self.get_image_processor()
+        video_processor = self.get_video_processor()
        tokenizer = self.get_tokenizer()
        qformer_tokenizer = self.get_qformer_tokenizer()

        processor = InstructBlipVideoProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
        )

        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
@@ -166,12 +168,12 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertListEqual(decoded_tok, decoded_processor)

    def test_model_input_names(self):
-        image_processor = self.get_image_processor()
+        video_processor = self.get_video_processor()
        tokenizer = self.get_tokenizer()
        qformer_tokenizer = self.get_qformer_tokenizer()

        processor = InstructBlipVideoProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
        )

        input_str = "lower newer"
--- a/tests/models/instructblipvideo/test_video_processing_instrictblipvideo.py
+++ b/tests/models/instructblipvideo/test_video_processing_instrictblipvideo.py
@@ -0,0 +1,116 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torchvision_available, is_vision_available
+
+from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
+
+
+if is_vision_available():
+    if is_torchvision_available():
+        from transformers import InstructBlipVideoVideoProcessor
+
+
+class InstructBlipVideoVideoProcessingTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=5,
+        num_channels=3,
+        num_frames=4,
+        min_resolution=30,
+        max_resolution=80,
+        do_resize=True,
+        size=None,
+        do_normalize=True,
+        image_mean=OPENAI_CLIP_MEAN,
+        image_std=OPENAI_CLIP_STD,
+        do_convert_rgb=True,
+    ):
+        super().__init__()
+        size = size if size is not None else {"height": 18, "width": 18}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+
+    def prepare_video_processor_dict(self):
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_convert_rgb": self.do_convert_rgb,
+        }
+
+    def expected_output_video_shape(self, images):
+        return self.num_frames, self.num_channels, self.size["height"], self.size["width"]
+
+    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
+        videos = prepare_video_inputs(
+            batch_size=self.batch_size,
+            num_frames=self.num_frames,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            return_tensors=return_tensors,
+        )
+
+        return videos
+
+
+@require_torch
+@require_vision
+class InstructBlipVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
+    fast_video_processing_class = InstructBlipVideoVideoProcessor if is_torchvision_available() else None
+    input_name = "pixel_values"
+
+    def setUp(self):
+        super().setUp()
+        self.video_processor_tester = InstructBlipVideoVideoProcessingTester(self)
+
+    @property
+    def video_processor_dict(self):
+        return self.video_processor_tester.prepare_video_processor_dict()
+
+    def test_image_processor_properties(self):
+        video_processing = self.fast_video_processing_class(**self.video_processor_dict)
+        self.assertTrue(hasattr(video_processing, "do_resize"))
+        self.assertTrue(hasattr(video_processing, "size"))
+        self.assertTrue(hasattr(video_processing, "do_normalize"))
+        self.assertTrue(hasattr(video_processing, "image_mean"))
+        self.assertTrue(hasattr(video_processing, "image_std"))
+        self.assertTrue(hasattr(video_processing, "do_convert_rgb"))
+
+    def test_video_processor_from_dict_with_kwargs(self):
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
+        self.assertEqual(video_processor.size, {"height": 18, "width": 18})
+
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
+        self.assertEqual(video_processor.size, {"height": 42, "width": 42})
--- a/tests/models/internvl/test_processor_internvl.py
+++ b/tests/models/internvl/test_processor_internvl.py
@@ -18,12 +18,13 @@ import tempfile
 import unittest

 from huggingface_hub import hf_hub_download
+from parameterized import parameterized

 from transformers import AutoProcessor, AutoTokenizer, InternVLProcessor
 from transformers.testing_utils import require_av, require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available

-from ...test_processing_common import ProcessorTesterMixin
+from ...test_processing_common import MODALITY_INPUT_DATA, ProcessorTesterMixin


 if is_torch_available():
@@ -31,7 +32,7 @@ if is_torch_available():


 if is_vision_available():
-    from transformers import GotOcr2ImageProcessor
+    from transformers import GotOcr2ImageProcessor, InternVLVideoProcessor


@require_vision
@@ -55,12 +56,22 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            image_std=[0.229, 0.224, 0.225],
            do_convert_rgb=True,
        )
+        video_processor = InternVLVideoProcessor(
+            do_resize=True,
+            size={"height": 20, "width": 20},
+            do_rescale=True,
+            rescale_factor=1 / 255,
+            do_normalize=True,
+            image_mean=[0.485, 0.456, 0.406],
+            image_std=[0.229, 0.224, 0.225],
+            do_convert_rgb=True,
+        )
        tokenizer = AutoTokenizer.from_pretrained("OpenGVLab/InternVL3-1B-hf", padding_side="left")
        processor_kwargs = cls.prepare_processor_dict()
-        processor = InternVLProcessor.from_pretrained(
-            "OpenGVLab/InternVL3-1B-hf",
+        processor = InternVLProcessor(
            image_processor=image_processor,
            tokenizer=tokenizer,
+            video_processor=video_processor,
            **processor_kwargs,
        )
        processor.save_pretrained(cls.tmpdirname)
@@ -69,7 +80,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):

    @staticmethod
    def prepare_processor_dict():
-        return {"image_seq_length": 10}
+        return {"image_seq_length": 2}

    def get_tokenizer(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -77,6 +88,9 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

+    def get_video_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
+
    def get_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)

@@ -168,6 +182,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):

    # Override video chat_template tests as InternVLProcessor returns flattened video features
    @require_av
+    @require_torch
    def test_apply_chat_template_video_special_processing(self):
        """
        Tests that models can use their own preprocessing to preprocess conversations.
@@ -225,7 +240,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
-            return_tensors="np",
+            return_tensors="pt",
            num_frames=8,
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
@@ -236,6 +251,8 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        # Difference with common tests, InternVLProcessor returns flattened video features, and uses 8 frames by default
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)

+    @require_torch
+    @require_av
    def test_apply_chat_template_video_frame_sampling(self):
        processor = self.get_processor()

@@ -271,7 +288,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            tokenize=True,
            return_dict=True,
            num_frames=num_frames,
-            return_tensors="np",
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), num_frames)
@@ -284,6 +301,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 300)
@@ -302,6 +320,97 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2)
+
+    @require_av
+    @parameterized.expand([(1, "pt"), (2, "pt")])
+    def test_apply_chat_template_video(self, batch_size: int, return_tensors: str):
+        processor = self.get_processor()
+        if processor.chat_template is None:
+            self.skipTest("Processor has no chat template")
+
+        if "video_processor" not in self.processor_class.attributes:
+            self.skipTest(f"`video_processor` attribute not present in {self.processor_class}")
+
+        batch_messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "text": "Describe this."}],
+                },
+            ]
+        ] * batch_size
+
+        # Test that jinja can be applied
+        formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
+        self.assertEqual(len(formatted_prompt), batch_size)
+
+        # Test that tokenizing with template and directly with `self.tokenizer` gives same output
+        formatted_prompt_tokenized = processor.apply_chat_template(
+            batch_messages, add_generation_prompt=True, tokenize=True, return_tensors="pt"
+        )
+        add_special_tokens = True
+        if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
+            add_special_tokens = False
+        tok_output = processor.tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=add_special_tokens)
+        expected_output = tok_output.input_ids
+        self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())
+
+        # Test that kwargs passed to processor's `__call__` are actually used
+        tokenized_prompt_100 = processor.apply_chat_template(
+            batch_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+            max_length=100,
+        )
+        self.assertEqual(len(tokenized_prompt_100[0]), 100)
+
+        # Test that `return_dict=True` returns text related inputs in the dict
+        out_dict_text = processor.apply_chat_template(
+            batch_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"]))
+        self.assertEqual(len(out_dict_text["input_ids"]), batch_size)
+        self.assertEqual(len(out_dict_text["attention_mask"]), batch_size)
+
+        # Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
+        for idx, url in enumerate(MODALITY_INPUT_DATA["videos"][:batch_size]):
+            batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": "video", "url": url}]
+
+        out_dict = processor.apply_chat_template(
+            batch_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+            num_frames=4,  # by default no more than 4 frames, otherwise too slow
+        )
+        self.assertTrue(self.videos_input_name in out_dict)
+        self.assertEqual(len(out_dict["input_ids"]), batch_size)
+        self.assertEqual(len(out_dict["attention_mask"]), batch_size)
+
+        video_len = 4 if batch_size == 1 else 3  # InternVL patches out and removes frames after processing
+        self.assertEqual(len(out_dict[self.videos_input_name]), video_len)
+        for k in out_dict:
+            self.assertIsInstance(out_dict[k], torch.Tensor)
+
+        # Test continue from final message
+        assistant_message = {
+            "role": "assistant",
+            "content": [{"type": "text", "text": "It is the sound of"}],
+        }
+        for batch_idx in range(batch_size):
+            batch_messages[batch_idx] = batch_messages[batch_idx] + [assistant_message]
+        continue_prompt = processor.apply_chat_template(batch_messages, continue_final_message=True, tokenize=False)
+        for prompt in continue_prompt:
+            self.assertTrue(prompt.endswith("It is the sound of"))  # no `eos` token at the end
--- a/tests/models/internvl/test_video_processor_internvl.py
+++ b/tests/models/internvl/test_video_processor_internvl.py
@@ -0,0 +1,107 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+
+from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
+
+
+if is_torch_available():
+    pass
+
+if is_vision_available():
+    if is_torchvision_available():
+        from transformers import InternVLVideoProcessor
+
+
+class InternVLVideoProcessingTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=5,
+        num_frames=8,
+        num_channels=3,
+        min_resolution=30,
+        max_resolution=80,
+        do_resize=True,
+        size=None,
+        do_normalize=True,
+        image_mean=OPENAI_CLIP_MEAN,
+        image_std=OPENAI_CLIP_STD,
+        do_convert_rgb=True,
+    ):
+        size = size if size is not None else {"height": 384, "width": 384}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+
+    def prepare_video_processor_dict(self):
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_convert_rgb": self.do_convert_rgb,
+        }
+
+    def expected_output_video_shape(self, videos):
+        return [self.num_frames, self.num_channels, self.size["height"], self.size["width"]]
+
+    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
+        videos = prepare_video_inputs(
+            batch_size=self.batch_size,
+            num_frames=self.num_frames,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            return_tensors=return_tensors,
+        )
+        return videos
+
+
+@require_torch
+@require_vision
+class InternVLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
+    fast_video_processing_class = InternVLVideoProcessor if is_torchvision_available() else None
+
+    def setUp(self):
+        super().setUp()
+        self.video_processor_tester = InternVLVideoProcessingTester(self)
+
+    @property
+    def video_processor_dict(self):
+        return self.video_processor_tester.prepare_video_processor_dict()
+
+    def test_video_processor_from_dict_with_kwargs(self):
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
+        self.assertEqual(video_processor.size, {"height": 384, "width": 384})
+
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
+        self.assertEqual(video_processor.size, {"height": 42, "width": 42})
--- a/tests/models/llava_next_video/test_image_processing_llava_next_video.py
+++ b/tests/models/llava_next_video/test_image_processing_llava_next_video.py
@@ -1,218 +0,0 @@
-# Copyright 2024 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
-from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_vision_available
-
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
-
-
-if is_torch_available():
-    import torch
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import LlavaNextVideoImageProcessor
-
-
-class LlavaNextVideoProcessingTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=5,
-        num_channels=3,
-        image_size=18,
-        min_resolution=30,
-        max_resolution=80,
-        do_resize=True,
-        size=None,
-        do_center_crop=True,
-        crop_size=None,
-        do_normalize=True,
-        image_mean=OPENAI_CLIP_MEAN,
-        image_std=OPENAI_CLIP_STD,
-        do_convert_rgb=True,
-    ):
-        size = size if size is not None else {"shortest_edge": 20}
-        crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
-        self.parent = parent
-        self.batch_size = batch_size
-        self.num_channels = num_channels
-        self.image_size = image_size
-        self.min_resolution = min_resolution
-        self.max_resolution = max_resolution
-        self.do_resize = do_resize
-        self.size = size
-        self.do_center_crop = do_center_crop
-        self.crop_size = crop_size
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.do_convert_rgb = do_convert_rgb
-
-    def prepare_image_processor_dict(self):
-        return {
-            "do_resize": self.do_resize,
-            "size": self.size,
-            "do_center_crop": self.do_center_crop,
-            "crop_size": self.crop_size,
-            "do_normalize": self.do_normalize,
-            "image_mean": self.image_mean,
-            "image_std": self.image_std,
-            "do_convert_rgb": self.do_convert_rgb,
-        }
-
-    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape
-    def expected_output_image_shape(self, images):
-        return self.num_channels, self.crop_size["height"], self.crop_size["width"]
-
-    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs
-    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
-        return prepare_image_inputs(
-            batch_size=self.batch_size,
-            num_channels=self.num_channels,
-            min_resolution=self.min_resolution,
-            max_resolution=self.max_resolution,
-            equal_resolution=equal_resolution,
-            numpify=numpify,
-            torchify=torchify,
-        )
-
-    def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
-        images = prepare_image_inputs(
-            batch_size=self.batch_size,
-            num_channels=self.num_channels,
-            min_resolution=self.min_resolution,
-            max_resolution=self.max_resolution,
-            equal_resolution=equal_resolution,
-            numpify=numpify,
-            torchify=torchify,
-        )
-
-        # let's simply copy the frames to fake a long video-clip
-        if numpify or torchify:
-            videos = []
-            for image in images:
-                if numpify:
-                    video = image[None, ...].repeat(8, 0)
-                else:
-                    video = image[None, ...].repeat(8, 1, 1, 1)
-                videos.append(video)
-        else:
-            videos = []
-            for pil_image in images:
-                videos.append([pil_image] * 8)
-
-        return videos
-
-
-@require_torch
-@require_vision
-class LlavaNextVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
-    image_processing_class = LlavaNextVideoImageProcessor if is_vision_available() else None
-
-    def setUp(self):
-        super().setUp()
-        self.image_processor_tester = LlavaNextVideoProcessingTester(self)
-
-    @property
-    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
-    def image_processor_dict(self):
-        return self.image_processor_tester.prepare_image_processor_dict()
-
-    def test_image_processor_properties(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        self.assertTrue(hasattr(image_processing, "do_resize"))
-        self.assertTrue(hasattr(image_processing, "size"))
-        self.assertTrue(hasattr(image_processing, "do_center_crop"))
-        self.assertTrue(hasattr(image_processing, "center_crop"))
-        self.assertTrue(hasattr(image_processing, "do_normalize"))
-        self.assertTrue(hasattr(image_processing, "image_mean"))
-        self.assertTrue(hasattr(image_processing, "image_std"))
-        self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
-
-    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs
-    def test_image_processor_from_dict_with_kwargs(self):
-        for image_processing_class in self.image_processor_list:
-            image_processor = image_processing_class.from_dict(self.image_processor_dict)
-            self.assertEqual(image_processor.size, {"shortest_edge": 20})
-            self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
-
-            image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
-            self.assertEqual(image_processor.size, {"shortest_edge": 42})
-            self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
-
-    def test_call_pil(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
-        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True)
-        for video in video_inputs:
-            self.assertIsInstance(video[0], Image.Image)
-
-        # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!)
-        encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (1, 8, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-        # Test batched
-        encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (5, 8, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-    def test_call_numpy(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
-        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, numpify=True)
-        for video in video_inputs:
-            self.assertIsInstance(video, np.ndarray)
-
-        # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!)
-        encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (1, 8, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-        # Test batched
-        encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (5, 8, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-    def test_call_pytorch(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random PyTorch tensors
-        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, torchify=True)
-        for video in video_inputs:
-            self.assertIsInstance(video, torch.Tensor)
-
-        # Test not batched input
-        encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (1, 8, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-        # Test batched
-        encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (5, 8, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-    @unittest.skip("LlavaNextVideoImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")
-    def test_call_numpy_4_channels(self):
-        pass
--- a/tests/models/llava_next_video/test_processor_llava_next_video.py
+++ b/tests/models/llava_next_video/test_processor_llava_next_video.py
@@ -19,13 +19,16 @@ import unittest

 from transformers import AutoProcessor, LlamaTokenizerFast, LlavaNextVideoProcessor
 from transformers.testing_utils import require_vision
-from transformers.utils import is_torch_available, is_vision_available
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available

 from ...test_processing_common import ProcessorTesterMixin


 if is_vision_available():
-    from transformers import LlavaNextImageProcessor, LlavaNextVideoImageProcessor
+    from transformers import LlavaNextImageProcessor
+
+    if is_torchvision_available():
+        from transformers import LlavaNextVideoVideoProcessor

 if is_torch_available:
    pass
@@ -39,7 +42,7 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def setUpClass(cls):
        cls.tmpdirname = tempfile.mkdtemp()
        image_processor = LlavaNextImageProcessor()
-        video_processor = LlavaNextVideoImageProcessor()
+        video_processor = LlavaNextVideoVideoProcessor()
        tokenizer = LlamaTokenizerFast.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
        tokenizer.add_special_tokens({"additional_special_tokens": ["<image>", "<video>"]})
        processor_kwargs = cls.prepare_processor_dict()
--- a/tests/models/llava_next_video/test_video_processing_llava_next_video.py
+++ b/tests/models/llava_next_video/test_video_processing_llava_next_video.py
@@ -0,0 +1,127 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+
+from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
+
+
+if is_torch_available():
+    pass
+
+if is_vision_available():
+    if is_torchvision_available():
+        from transformers import LlavaNextVideoVideoProcessor
+
+
+class LlavaNextVideoProcessingTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=5,
+        num_frames=8,
+        num_channels=3,
+        min_resolution=30,
+        max_resolution=80,
+        do_resize=True,
+        size=None,
+        do_center_crop=True,
+        crop_size=None,
+        do_normalize=True,
+        image_mean=OPENAI_CLIP_MEAN,
+        image_std=OPENAI_CLIP_STD,
+        do_convert_rgb=True,
+    ):
+        size = size if size is not None else {"height": 20, "width": 20}
+        crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+
+    def prepare_video_processor_dict(self):
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_center_crop": self.do_center_crop,
+            "crop_size": self.crop_size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_convert_rgb": self.do_convert_rgb,
+        }
+
+    def expected_output_video_shape(self, images):
+        return self.num_frames, self.num_channels, self.crop_size["height"], self.crop_size["width"]
+
+    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
+        videos = prepare_video_inputs(
+            batch_size=self.batch_size,
+            num_frames=self.num_frames,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            return_tensors=return_tensors,
+        )
+        return videos
+
+
+@require_torch
+@require_vision
+class LlavaNextVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
+    fast_video_processing_class = LlavaNextVideoVideoProcessor if is_torchvision_available() else None
+
+    def setUp(self):
+        super().setUp()
+        self.video_processor_tester = LlavaNextVideoProcessingTester(self)
+
+    @property
+    def video_processor_dict(self):
+        return self.video_processor_tester.prepare_video_processor_dict()
+
+    def test_video_processor_properties(self):
+        video_processing = self.fast_video_processing_class(**self.video_processor_dict)
+        self.assertTrue(hasattr(video_processing, "do_resize"))
+        self.assertTrue(hasattr(video_processing, "size"))
+        self.assertTrue(hasattr(video_processing, "do_center_crop"))
+        self.assertTrue(hasattr(video_processing, "center_crop"))
+        self.assertTrue(hasattr(video_processing, "do_normalize"))
+        self.assertTrue(hasattr(video_processing, "image_mean"))
+        self.assertTrue(hasattr(video_processing, "image_std"))
+        self.assertTrue(hasattr(video_processing, "do_convert_rgb"))
+
+    def test_video_processor_from_dict_with_kwargs(self):
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
+        self.assertEqual(video_processor.size, {"height": 20, "width": 20})
+        self.assertEqual(video_processor.crop_size, {"height": 18, "width": 18})
+
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42, crop_size=84)
+        self.assertEqual(video_processor.size, {"shortest_edge": 42})
+        self.assertEqual(video_processor.crop_size, {"height": 84, "width": 84})
--- a/tests/models/llava_onevision/test_image_processing_llava_onevision.py
+++ b/tests/models/llava_onevision/test_image_processing_llava_onevision.py
@@ -32,7 +32,7 @@ if is_vision_available():
    from transformers import LlavaOnevisionImageProcessor

    if is_torchvision_available():
-        from transformers import LlavaOnevisionImageProcessorFast, LlavaOnevisionVideoProcessor
+        from transformers import LlavaOnevisionImageProcessorFast


 class LlavaOnevisionImageProcessingTester:
@@ -91,41 +91,12 @@ class LlavaOnevisionImageProcessingTester:
            torchify=torchify,
        )

-    # Copied from tests.models.llava_next_video.test_image_processing_llava_next_video.LlavaNextVideoProcessingTester.prepare_video_inputs
-    def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
-        images = prepare_image_inputs(
-            batch_size=self.batch_size,
-            num_channels=self.num_channels,
-            min_resolution=self.min_resolution,
-            max_resolution=self.max_resolution,
-            equal_resolution=equal_resolution,
-            numpify=numpify,
-            torchify=torchify,
-        )
-
-        # let's simply copy the frames to fake a long video-clip
-        if numpify or torchify:
-            videos = []
-            for image in images:
-                if numpify:
-                    video = image[None, ...].repeat(8, 0)
-                else:
-                    video = image[None, ...].repeat(8, 1, 1, 1)
-                videos.append(video)
-        else:
-            videos = []
-            for pil_image in images:
-                videos.append([pil_image] * 8)
-
-        return videos
-

@require_torch
@require_vision
 class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = LlavaOnevisionImageProcessor if is_vision_available() else None
    fast_image_processing_class = LlavaOnevisionImageProcessorFast if is_torchvision_available() else None
-    video_processing_class = LlavaOnevisionVideoProcessor if is_vision_available() else None

    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaOnevision
    def setUp(self):
@@ -148,15 +119,6 @@ class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestC
            self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
            self.assertTrue(hasattr(image_processing, "image_grid_pinpoints"))

-    def test_video_processor_properties(self):
-        image_processing = self.video_processing_class(**self.image_processor_dict)
-        self.assertTrue(hasattr(image_processing, "do_resize"))
-        self.assertTrue(hasattr(image_processing, "size"))
-        self.assertTrue(hasattr(image_processing, "do_normalize"))
-        self.assertTrue(hasattr(image_processing, "image_mean"))
-        self.assertTrue(hasattr(image_processing, "image_std"))
-        self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
-
    def test_image_processor_from_dict_with_kwargs(self):
        for image_processing_class in self.image_processor_list:
            image_processor = image_processing_class.from_dict(self.image_processor_dict)
@@ -248,58 +210,6 @@ class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestC
            # Image processor should return same pixel values, independently of input format
            self.assertTrue((encoded_images_nested == encoded_images).all())

-    def test_call_pil_video(self):
-        # Initialize image_processing
-        video_processing = self.video_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
-        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True)
-        for video in video_inputs:
-            self.assertIsInstance(video[0], Image.Image)
-
-        encoded_videos = video_processing(video_inputs[0], return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (1, 8, 3, 20, 20)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-        # Test batched
-        encoded_videos = video_processing(video_inputs, return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (7, 8, 3, 20, 20)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-    def test_call_numpy_video(self):
-        # Initialize image_processing
-        video_processing = self.video_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
-        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, numpify=True)
-        for video in video_inputs:
-            self.assertIsInstance(video, np.ndarray)
-
-        encoded_videos = video_processing(video_inputs[0], return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (1, 8, 3, 20, 20)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-        # Test batched
-        encoded_videos = video_processing(video_inputs, return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (7, 8, 3, 20, 20)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-    def test_call_pytorch_video(self):
-        # Initialize image_processing
-        video_processing = self.video_processing_class(**self.image_processor_dict)
-        # create random PyTorch tensors
-        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, torchify=True)
-        for video in video_inputs:
-            self.assertIsInstance(video, torch.Tensor)
-
-        # Test not batched input
-        encoded_videos = video_processing(video_inputs[0], return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (1, 8, 3, 20, 20)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-        # Test batched
-        encoded_videos = video_processing(video_inputs, return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (7, 8, 3, 20, 20)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
    @unittest.skip(
        reason="LlavaOnevisionImageProcessorFast doesn't compile (infinitely) when using class transforms"
    )  # FIXME yoni
--- a/tests/models/llava_onevision/test_processor_llava_onevision.py
+++ b/tests/models/llava_onevision/test_processor_llava_onevision.py
@@ -16,8 +16,8 @@ import shutil
 import tempfile
 import unittest

-from transformers.testing_utils import require_vision
-from transformers.utils import is_torch_available, is_vision_available
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available

 from ...test_processing_common import ProcessorTesterMixin

@@ -27,15 +27,18 @@ if is_vision_available():
        AutoProcessor,
        LlavaOnevisionImageProcessor,
        LlavaOnevisionProcessor,
-        LlavaOnevisionVideoProcessor,
        Qwen2TokenizerFast,
    )

+    if is_torchvision_available():
+        from transformers import LlavaOnevisionVideoProcessor
+
 if is_torch_available:
    pass


@require_vision
+@require_torch
 class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = LlavaOnevisionProcessor

--- a/tests/models/llava_onevision/test_video_processing_llava_onevision.py
+++ b/tests/models/llava_onevision/test_video_processing_llava_onevision.py
@@ -0,0 +1,116 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+
+from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
+
+
+if is_torch_available():
+    pass
+
+if is_vision_available():
+    if is_torchvision_available():
+        from transformers import LlavaOnevisionVideoProcessor
+
+
+class LlavaOnevisionVideoProcessingTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        num_frames=8,
+        num_channels=3,
+        min_resolution=30,
+        max_resolution=400,
+        do_resize=True,
+        size=None,
+        do_normalize=True,
+        image_mean=OPENAI_CLIP_MEAN,
+        image_std=OPENAI_CLIP_STD,
+        do_convert_rgb=True,
+    ):
+        size = size if size is not None else {"height": 20, "width": 20}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+
+    def prepare_video_processor_dict(self):
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_convert_rgb": self.do_convert_rgb,
+        }
+
+    def expected_output_video_shape(self, video):
+        return self.num_frames, self.num_channels, self.size["height"], self.size["width"]
+
+    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
+        videos = prepare_video_inputs(
+            batch_size=self.batch_size,
+            num_frames=self.num_frames,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            return_tensors=return_tensors,
+        )
+        return videos
+
+
+@require_torch
+@require_vision
+class LlavaOnevisionVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
+    fast_video_processing_class = LlavaOnevisionVideoProcessor if is_torchvision_available() else None
+
+    def setUp(self):
+        super().setUp()
+        self.video_processor_tester = LlavaOnevisionVideoProcessingTester(self)
+
+    @property
+    def video_processor_dict(self):
+        return self.video_processor_tester.prepare_video_processor_dict()
+
+    def test_video_processor_properties(self):
+        video_processing = self.fast_video_processing_class(**self.video_processor_dict)
+        self.assertTrue(hasattr(video_processing, "do_resize"))
+        self.assertTrue(hasattr(video_processing, "size"))
+        self.assertTrue(hasattr(video_processing, "do_normalize"))
+        self.assertTrue(hasattr(video_processing, "image_mean"))
+        self.assertTrue(hasattr(video_processing, "image_std"))
+        self.assertTrue(hasattr(video_processing, "do_convert_rgb"))
+
+    def test_video_processor_from_dict_with_kwargs(self):
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
+        self.assertEqual(video_processor.size, {"height": 20, "width": 20})
+
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
+        self.assertEqual(video_processor.size, {"shortest_edge": 42})
--- a/tests/models/mistral3/test_processor_mistral3.py
+++ b/tests/models/mistral3/test_processor_mistral3.py
@@ -16,7 +16,7 @@ import shutil
 import tempfile
 import unittest

-import requests
+import numpy as np

 from transformers import PixtralProcessor
 from transformers.testing_utils import require_vision
@@ -30,7 +30,7 @@ if is_torch_available():


 if is_vision_available():
-    from PIL import Image
+    pass


@require_vision
@@ -42,11 +42,10 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.url_0 = "https://www.ilankelman.org/stopsigns/australia.jpg"
-        cls.image_0 = Image.open(requests.get(cls.url_0, stream=True).raw)
+        cls.image_0 = np.random.randint(255, size=(3, 876, 1300), dtype=np.uint8)
        cls.url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        cls.image_1 = Image.open(requests.get(cls.url_1, stream=True).raw)
-        cls.url_2 = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
-        cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw)
+        cls.image_1 = np.random.randint(255, size=(3, 480, 640), dtype=np.uint8)
+        cls.image_2 = np.random.randint(255, size=(3, 1024, 1024), dtype=np.uint8)

        cls.tmpdirname = tempfile.mkdtemp()
        cls.addClassCleanup(lambda tempdir=cls.tmpdirname: shutil.rmtree(tempdir))
--- a/tests/models/pixtral/test_processor_pixtral.py
+++ b/tests/models/pixtral/test_processor_pixtral.py
@@ -15,7 +15,7 @@ import shutil
 import tempfile
 import unittest

-import requests
+import numpy as np
 import torch

 from transformers.testing_utils import require_vision
@@ -25,8 +25,6 @@ from ...test_processing_common import ProcessorTesterMixin


 if is_vision_available():
-    from PIL import Image
-
    from transformers import PixtralProcessor


@@ -37,11 +35,10 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.url_0 = "https://www.ilankelman.org/stopsigns/australia.jpg"
-        cls.image_0 = Image.open(requests.get(cls.url_0, stream=True).raw)
+        cls.image_0 = np.random.randint(255, size=(3, 876, 1300), dtype=np.uint8)
        cls.url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        cls.image_1 = Image.open(requests.get(cls.url_1, stream=True).raw)
-        cls.url_2 = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
-        cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw)
+        cls.image_1 = np.random.randint(255, size=(3, 480, 640), dtype=np.uint8)
+        cls.image_2 = np.random.randint(255, size=(3, 1024, 1024), dtype=np.uint8)

    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()
--- a/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py
+++ b/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py
@@ -64,8 +64,12 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
+        video_processor = self.get_component("video_processor")
        processor = self.processor_class(
-            tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
+            tokenizer=tokenizer,
+            video_processor=video_processor,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
        )
        self.skip_processor_without_typed_kwargs(processor)
        input_str = "lower newer"
@@ -91,8 +95,12 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
+        video_processor = self.get_component("video_processor")
        processor = self.processor_class(
-            tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
+            tokenizer=tokenizer,
+            video_processor=video_processor,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
        )
        self.skip_processor_without_typed_kwargs(processor)

@@ -125,8 +133,12 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
+        video_processor = self.get_component("video_processor")
        processor = self.processor_class(
-            tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
+            tokenizer=tokenizer,
+            video_processor=video_processor,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
        )
        self.skip_processor_without_typed_kwargs(processor)

@@ -159,7 +171,13 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
-        self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor)
+        video_processor = self.get_component("video_processor")
+        _ = self.processor_class(
+            tokenizer=tokenizer,
+            video_processor=video_processor,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+        )  # Why delete test? TODO: raushan double check tests after cleaning model

    @require_torch
    def test_kwargs_overrides_default_tokenizer_kwargs_audio(self):
@@ -175,7 +193,13 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
-        self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor)
+        video_processor = self.get_component("video_processor")
+        _ = self.processor_class(
+            tokenizer=tokenizer,
+            video_processor=video_processor,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+        )

    @classmethod
    def setUpClass(cls):
@@ -190,6 +214,9 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

+    def get_video_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
+
    def get_feature_extractor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).feature_extractor

@@ -212,10 +239,14 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
        feature_extractor = self.get_feature_extractor()
-
-        processor = Qwen2_5OmniProcessor(
-            image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer
+        video_processor = self.get_video_processor()
+        processor = self.processor_class(
+            tokenizer=tokenizer,
+            video_processor=video_processor,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
        )
+
        processor.save_pretrained(self.tmpdirname)
        processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=False)

@@ -230,9 +261,12 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
        feature_extractor = self.get_feature_extractor()
-
-        processor = Qwen2_5OmniProcessor(
-            image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer
+        video_processor = self.get_video_processor()
+        processor = self.processor_class(
+            tokenizer=tokenizer,
+            video_processor=video_processor,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
        )

        image_input = self.prepare_image_inputs()
@@ -247,9 +281,12 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
        feature_extractor = self.get_feature_extractor()
-
-        processor = Qwen2_5OmniProcessor(
-            image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer
+        video_processor = self.get_video_processor()
+        processor = self.processor_class(
+            tokenizer=tokenizer,
+            video_processor=video_processor,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
        )

        input_str = "lower newer"
@@ -281,9 +318,12 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
        feature_extractor = self.get_feature_extractor()
-
-        processor = Qwen2_5OmniProcessor(
-            image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer
+        video_processor = self.get_video_processor()
+        processor = self.processor_class(
+            tokenizer=tokenizer,
+            video_processor=video_processor,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
        )

        input_str = "lower newer"
@@ -377,7 +417,10 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(input_name in out_dict)
        self.assertEqual(len(out_dict["input_ids"]), batch_size)
        self.assertEqual(len(out_dict["attention_mask"]), batch_size)
-        self.assertEqual(len(out_dict[input_name]), batch_size * 1564)
+
+        video_len = 5760 if batch_size == 1 else 5808  # qwen pixels don't scale with bs same way as other models
+        mm_len = batch_size * 1564 if modality == "image" else video_len
+        self.assertEqual(len(out_dict[input_name]), mm_len)

        return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
        for k in out_dict:
--- a/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
@@ -55,6 +55,9 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

+    def get_video_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
+
    @staticmethod
    def prepare_processor_dict():
        return {
@@ -68,8 +71,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def test_save_load_pretrained_default(self):
        tokenizer = self.get_tokenizer()
        image_processor = self.get_image_processor()
+        video_processor = self.get_video_processor()

-        processor = Qwen2_5_VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = Qwen2_5_VLProcessor(
+            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
+        )
        processor.save_pretrained(self.tmpdirname)
        processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)

@@ -81,8 +87,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def test_image_processor(self):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
+        video_processor = self.get_video_processor()

-        processor = Qwen2_5_VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = Qwen2_5_VLProcessor(
+            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
+        )

        image_input = self.prepare_image_inputs()

@@ -95,8 +104,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def test_processor(self):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
+        video_processor = self.get_video_processor()

-        processor = Qwen2_5_VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = Qwen2_5_VLProcessor(
+            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
+        )

        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
@@ -118,8 +130,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def test_model_input_names(self):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
+        video_processor = self.get_video_processor()

-        processor = Qwen2_5_VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = Qwen2_5_VLProcessor(
+            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
+        )

        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
@@ -130,6 +145,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertListEqual(list(inputs.keys()), processor.model_input_names)

    @require_torch
+    @require_av
    def _test_apply_chat_template(
        self,
        modality: str,
@@ -212,7 +228,10 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(input_name in out_dict)
        self.assertEqual(len(out_dict["input_ids"]), batch_size)
        self.assertEqual(len(out_dict["attention_mask"]), batch_size)
-        self.assertEqual(len(out_dict[input_name]), batch_size * 192)
+
+        video_len = 360 if batch_size == 1 else 320  # qwen pixels don't scale with bs same way as other models
+        mm_len = batch_size * 192 if modality == "image" else video_len
+        self.assertEqual(len(out_dict[input_name]), mm_len)

        return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
        for k in out_dict:
@@ -394,7 +413,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
-            return_tensors="np",
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)

--- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -21,7 +21,7 @@ import requests
 from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+from transformers.utils import is_torch_available, is_vision_available

 from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs

@@ -34,8 +34,8 @@ if is_vision_available():

    from transformers import Qwen2VLImageProcessor

-    if is_torchvision_available():
-        from transformers import Qwen2VLImageProcessorFast
+    # if is_torchvision_available():
+    #     from transformers import Qwen2VLImageProcessorFast


 class Qwen2VLImageProcessingTester:
@@ -118,7 +118,7 @@ class Qwen2VLImageProcessingTester:
@require_vision
 class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = Qwen2VLImageProcessor if is_vision_available() else None
-    fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None
+    # fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None

    def setUp(self):
        super().setUp()
--- a/tests/models/qwen2_vl/test_processor_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.py
@@ -23,7 +23,7 @@ from huggingface_hub import hf_hub_download

 from transformers import AutoProcessor, Qwen2Tokenizer
 from transformers.testing_utils import require_av, require_torch, require_vision
-from transformers.utils import is_torch_available, is_vision_available
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available

 from ...test_processing_common import ProcessorTesterMixin

@@ -31,6 +31,9 @@ from ...test_processing_common import ProcessorTesterMixin
 if is_vision_available():
    from transformers import Qwen2VLImageProcessor, Qwen2VLProcessor

+    if is_torchvision_available():
+        from transformers import Qwen2VLVideoProcessor
+
 if is_torch_available():
    import torch

@@ -55,6 +58,9 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

+    def get_video_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
+
    @staticmethod
    def prepare_processor_dict():
        return {"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"}  # fmt: skip
@@ -66,8 +72,11 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def test_save_load_pretrained_default(self):
        tokenizer = self.get_tokenizer()
        image_processor = self.get_image_processor()
+        video_processor = self.get_video_processor()

-        processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = Qwen2VLProcessor(
+            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
+        )
        processor.save_pretrained(self.tmpdirname)
        processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)

@@ -75,12 +84,16 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
        self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
        self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
+        self.assertIsInstance(processor.video_processor, Qwen2VLVideoProcessor)

    def test_image_processor(self):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
+        video_processor = self.get_video_processor()

-        processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = Qwen2VLProcessor(
+            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
+        )

        image_input = self.prepare_image_inputs()

@@ -93,8 +106,11 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def test_processor(self):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
+        video_processor = self.get_video_processor()

-        processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = Qwen2VLProcessor(
+            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
+        )

        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
@@ -113,8 +129,11 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def test_model_input_names(self):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
+        video_processor = self.get_video_processor()

-        processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = Qwen2VLProcessor(
+            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
+        )

        input_str = "lower newer"
        image_input = self.prepare_image_inputs()
@@ -125,6 +144,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertListEqual(list(inputs.keys()), processor.model_input_names)

    @require_torch
+    @require_av
    def _test_apply_chat_template(
        self,
        modality: str,
@@ -207,7 +227,10 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertTrue(input_name in out_dict)
        self.assertEqual(len(out_dict["input_ids"]), batch_size)
        self.assertEqual(len(out_dict["attention_mask"]), batch_size)
-        self.assertEqual(len(out_dict[input_name]), batch_size * 192)
+
+        video_len = 360 if batch_size == 1 else 320  # qwen pixels don't scale with bs same way as other models
+        mm_len = batch_size * 192 if modality == "image" else video_len
+        self.assertEqual(len(out_dict[input_name]), mm_len)

        return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
        for k in out_dict:
@@ -373,7 +396,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
-            return_tensors="np",
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)

--- a/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
@@ -0,0 +1,291 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+
+from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers.image_utils import get_image_size
+    from transformers.models.qwen2_vl.video_processing_qwen2_vl import smart_resize
+
+    if is_torchvision_available():
+        from transformers import Qwen2VLVideoProcessor
+
+
+class Qwen2VLVideoProcessingTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=5,
+        num_frames=8,
+        num_channels=3,
+        min_resolution=30,
+        max_resolution=80,
+        do_resize=True,
+        size=None,
+        do_center_crop=True,
+        crop_size=None,
+        do_normalize=True,
+        image_mean=OPENAI_CLIP_MEAN,
+        image_std=OPENAI_CLIP_STD,
+        do_convert_rgb=True,
+        temporal_patch_size=2,
+        patch_size=14,
+        min_pixels=20 * 20,
+        max_pixels=100 * 100,
+        merge_size=2,
+    ):
+        size = size if size is not None else {"shortest_edge": 20}
+        crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+        self.temporal_patch_size = temporal_patch_size
+        self.patch_size = patch_size
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.merge_size = merge_size
+
+    def prepare_video_processor_dict(self):
+        return {
+            "do_resize": self.do_resize,
+            "do_center_crop": self.do_center_crop,
+            "crop_size": self.crop_size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_convert_rgb": self.do_convert_rgb,
+            "temporal_patch_size": self.temporal_patch_size,
+            "patch_size": self.patch_size,
+            "min_pixels": self.min_pixels,
+            "max_pixels": self.max_pixels,
+            "merge_size": self.merge_size,
+        }
+
+    @require_vision
+    def expected_output_video_shape(self, videos):
+        grid_t = self.num_frames // self.temporal_patch_size
+        hidden_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
+        seq_len = 0
+        for video in videos:
+            if isinstance(video[0], Image.Image):
+                video = np.stack([np.array(frame) for frame in video])
+            height, width = get_image_size(video)
+            resized_height, resized_width = smart_resize(
+                height,
+                width,
+                factor=self.patch_size * self.merge_size,
+                min_pixels=self.min_pixels,
+                max_pixels=self.max_pixels,
+            )
+            grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
+            seq_len += grid_t * grid_h * grid_w
+        return [seq_len, hidden_dim]
+
+    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
+        videos = prepare_video_inputs(
+            batch_size=self.batch_size,
+            num_frames=self.num_frames,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            return_tensors=return_tensors,
+        )
+        return videos
+
+
+@require_torch
+@require_vision
+class Qwen2VLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
+    fast_video_processing_class = Qwen2VLVideoProcessor if is_torchvision_available() else None
+
+    def setUp(self):
+        super().setUp()
+        self.video_processor_tester = Qwen2VLVideoProcessingTester(self)
+
+    @property
+    def video_processor_dict(self):
+        return self.video_processor_tester.prepare_video_processor_dict()
+
+    def test_video_processor_properties(self):
+        video_processing = self.fast_video_processing_class(**self.video_processor_dict)
+        self.assertTrue(hasattr(video_processing, "do_resize"))
+        self.assertTrue(hasattr(video_processing, "size"))
+        self.assertTrue(hasattr(video_processing, "do_center_crop"))
+        self.assertTrue(hasattr(video_processing, "center_crop"))
+        self.assertTrue(hasattr(video_processing, "do_normalize"))
+        self.assertTrue(hasattr(video_processing, "image_mean"))
+        self.assertTrue(hasattr(video_processing, "image_std"))
+        self.assertTrue(hasattr(video_processing, "do_convert_rgb"))
+
+    # OVERRIDEN BECAUSE QWEN2_VL HAS SPECIAL OUTPUT SHAPES
+    def test_video_processor_from_dict_with_kwargs(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor = video_processing_class(**self.video_processor_dict)
+            self.assertEqual(video_processor.min_pixels, self.video_processor_tester.min_pixels)
+            self.assertEqual(video_processor.max_pixels, self.video_processor_tester.max_pixels)
+
+            video_processor = video_processing_class.from_dict(
+                self.video_processor_dict, min_pixels=256 * 256, max_pixels=640 * 640
+            )
+            self.assertEqual(video_processor.min_pixels, 256 * 256)
+            self.assertEqual(video_processor.max_pixels, 640 * 640)
+
+    def test_call_pil(self):
+        for video_processing_class in self.video_processor_list:
+            # Initialize video_processing
+            video_processing = video_processing_class(**self.video_processor_dict)
+            video_inputs = self.video_processor_tester.prepare_video_inputs(
+                equal_resolution=False, return_tensors="pil"
+            )
+
+            # Each video is a list of PIL Images
+            for video in video_inputs:
+                self.assertIsInstance(video[0], Image.Image)
+
+            # Test not batched input
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
+            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
+
+            # Test batched
+            encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
+            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
+
+    def test_call_numpy(self):
+        for video_processing_class in self.video_processor_list:
+            # Initialize video_processing
+            video_processing = video_processing_class(**self.video_processor_dict)
+            # create random numpy tensors
+            video_inputs = self.video_processor_tester.prepare_video_inputs(
+                equal_resolution=False, return_tensors="np"
+            )
+            for video in video_inputs:
+                self.assertIsInstance(video, np.ndarray)
+
+            # Test not batched input
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
+            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
+
+            # Test batched
+            encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
+            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
+
+    def test_call_pytorch(self):
+        for video_processing_class in self.video_processor_list:
+            # Initialize video_processing
+            video_processing = video_processing_class(**self.video_processor_dict)
+            # create random PyTorch tensors
+            video_inputs = self.video_processor_tester.prepare_video_inputs(
+                equal_resolution=False, return_tensors="torch"
+            )
+
+            for video in video_inputs:
+                self.assertIsInstance(video, torch.Tensor)
+
+            # Test not batched input
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
+            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
+
+            # Test batched
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
+            encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
+            self.assertEqual(
+                list(encoded_videos.shape),
+                expected_output_video_shape,
+            )
+
+    def test_nested_input(self):
+        """Tests that the processor can work with nested list where each video is a list of arrays"""
+        for video_processing_class in self.video_processor_list:
+            video_processing = video_processing_class(**self.video_processor_dict)
+            video_inputs = self.video_processor_tester.prepare_video_inputs(
+                equal_resolution=False, return_tensors="np"
+            )
+
+            # Test not batched input
+            video_inputs_nested = [list(video) for video in video_inputs]
+            encoded_videos = video_processing(video_inputs_nested[0], return_tensors="pt")[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
+            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
+
+            # Test batched
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
+            encoded_videos = video_processing(video_inputs_nested, return_tensors="pt")[self.input_name]
+            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
+
+    @unittest.skip("Skip for now, the test needs adjustment fo Qwen2VL")
+    def test_call_numpy_4_channels(self):
+        for video_processing_class in self.video_processor_list:
+            # Test that can process videos which have an arbitrary number of channels
+            # Initialize video_processing
+            video_processor = video_processing_class(**self.video_processor_dict)
+
+            # create random numpy tensors
+            self.video_processor_tester.num_channels = 4
+            video_inputs = self.video_processor_tester.prepare_video_inputs(
+                equal_resolution=False, return_tensors="np"
+            )
+
+            # Test not batched input
+            encoded_videos = video_processor(
+                video_inputs[0],
+                return_tensors="pt",
+                input_data_format="channels_last",
+                image_mean=0,
+                image_std=1,
+            )[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
+            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
+
+            # Test batched
+            encoded_videos = video_processor(
+                video_inputs,
+                return_tensors="pt",
+                input_data_format="channels_last",
+                image_mean=0,
+                image_std=1,
+            )[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
+            self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
--- a/tests/models/smolvlm/test_processor_smolvlm.py
+++ b/tests/models/smolvlm/test_processor_smolvlm.py
@@ -22,7 +22,7 @@ import requests

 from transformers import SmolVLMProcessor
 from transformers.models.auto.processing_auto import AutoProcessor
-from transformers.testing_utils import require_av, require_torch, require_vision
+from transformers.testing_utils import is_flaky, require_av, require_torch, require_vision
 from transformers.utils import is_vision_available

 from ...test_processing_common import ProcessorTesterMixin
@@ -63,6 +63,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        )
        cls.bos_token = processor.tokenizer.bos_token
        cls.image_token = processor.image_token
+        cls.video_token = processor.image_token * 8  # SmolVLM uses image token and repeats it `num_frames` times
        cls.fake_image_token = processor.fake_image_token
        cls.global_img_token = processor.global_image_token

@@ -79,6 +80,9 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

+    def get_video_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
+
    def get_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)

@@ -114,6 +118,10 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdirname, ignore_errors=True)

+    @is_flaky  # fails 15 out of 100, FIXME @raushan
+    def test_structured_kwargs_nested_from_dict_video(self):
+        super().test_structured_kwargs_nested_from_dict_video()
+
    def test_process_interleaved_images_prompts_no_image_splitting(self):
        processor_components = self.prepare_components()
        processor_components["tokenizer"] = self.get_component("tokenizer", padding_side="left")
@@ -433,10 +441,13 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor")
+        video_processor = self.get_component("video_processor")
        tokenizer = self.get_component("tokenizer")

        processor_kwargs = self.prepare_processor_dict()
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs)
+        processor = self.processor_class(
+            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor, **processor_kwargs
+        )
        self.skip_processor_without_typed_kwargs(processor)

        input_str = self.prepare_text_inputs(batch_size=2, modality="image")
@@ -556,3 +567,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
                padding=True,
                max_length=20,
            )
+
+    @unittest.skip("SmolVLM cannot accept image URL as video frames, because it needs to know video fps and duration")
+    def test_apply_chat_template_video_1(self):
+        pass
--- a/tests/models/smolvlm/test_video_processing_smolvlm.py
+++ b/tests/models/smolvlm/test_video_processing_smolvlm.py
@@ -0,0 +1,118 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+
+from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    if is_torchvision_available():
+        from transformers import SmolVLMVideoProcessor
+        from transformers.models.smolvlm.video_processing_smolvlm import get_resize_output_image_size
+
+
+class SmolVLMVideoProcessingTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=5,
+        num_frames=8,
+        num_channels=3,
+        min_resolution=30,
+        max_resolution=80,
+        do_resize=True,
+        size=None,
+        do_normalize=True,
+        image_mean=IMAGENET_STANDARD_MEAN,
+        image_std=IMAGENET_STANDARD_STD,
+        do_convert_rgb=True,
+    ):
+        size = size if size is not None else {"longest_edge": 20}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+
+    def prepare_video_processor_dict(self):
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_convert_rgb": self.do_convert_rgb,
+        }
+
+    def expected_output_video_shape(self, videos):
+        max_height, max_width = 0, 0
+        if not isinstance(videos[0], torch.Tensor):
+            videos = [torch.tensor(np.array(video)).permute(0, -1, -3, -2) for video in videos]
+        for video in videos:
+            height, width = get_resize_output_image_size(video, self.size["longest_edge"])
+            max_height = max(height, max_height)
+            max_width = max(width, max_width)
+        return [self.num_frames, self.num_channels, max_height, max_width]
+
+    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
+        videos = prepare_video_inputs(
+            batch_size=self.batch_size,
+            num_frames=self.num_frames,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            return_tensors=return_tensors,
+        )
+        return videos
+
+
+@require_torch
+@require_vision
+class SmolVLMVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
+    fast_video_processing_class = SmolVLMVideoProcessor if is_torchvision_available() else None
+    input_name = "pixel_values"
+
+    def setUp(self):
+        super().setUp()
+        self.video_processor_tester = SmolVLMVideoProcessingTester(self)
+
+    @property
+    def video_processor_dict(self):
+        return self.video_processor_tester.prepare_video_processor_dict()
+
+    def test_video_processor_from_dict_with_kwargs(self):
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
+        self.assertEqual(video_processor.size, {"longest_edge": 20})
+
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
+        self.assertEqual(video_processor.size, {"height": 42, "width": 42})
--- a/tests/models/video_llava/test_image_processing_video_llava.py
+++ b/tests/models/video_llava/test_image_processing_video_llava.py
@@ -1,327 +0,0 @@
-# Copyright 2024 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-from parameterized import parameterized
-
-from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
-from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_vision_available
-
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
-
-
-if is_torch_available():
-    import torch
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import VideoLlavaImageProcessor
-
-
-class VideoLlavaImageProcessingTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=5,
-        num_channels=3,
-        image_size=18,
-        min_resolution=30,
-        max_resolution=80,
-        do_resize=True,
-        size=None,
-        do_center_crop=True,
-        crop_size=None,
-        do_normalize=True,
-        image_mean=OPENAI_CLIP_MEAN,
-        image_std=OPENAI_CLIP_STD,
-        do_convert_rgb=True,
-    ):
-        size = size if size is not None else {"shortest_edge": 20}
-        crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
-        self.parent = parent
-        self.batch_size = batch_size
-        self.num_channels = num_channels
-        self.image_size = image_size
-        self.min_resolution = min_resolution
-        self.max_resolution = max_resolution
-        self.do_resize = do_resize
-        self.size = size
-        self.do_center_crop = do_center_crop
-        self.crop_size = crop_size
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.do_convert_rgb = do_convert_rgb
-
-    def prepare_image_processor_dict(self):
-        return {
-            "do_resize": self.do_resize,
-            "size": self.size,
-            "do_center_crop": self.do_center_crop,
-            "crop_size": self.crop_size,
-            "do_normalize": self.do_normalize,
-            "image_mean": self.image_mean,
-            "image_std": self.image_std,
-            "do_convert_rgb": self.do_convert_rgb,
-        }
-
-    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape
-    def expected_output_image_shape(self, images):
-        return self.num_channels, self.crop_size["height"], self.crop_size["width"]
-
-    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs
-    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
-        return prepare_image_inputs(
-            batch_size=self.batch_size,
-            num_channels=self.num_channels,
-            min_resolution=self.min_resolution,
-            max_resolution=self.max_resolution,
-            equal_resolution=equal_resolution,
-            numpify=numpify,
-            torchify=torchify,
-        )
-
-    def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
-        images = prepare_image_inputs(
-            batch_size=self.batch_size,
-            num_channels=self.num_channels,
-            min_resolution=self.min_resolution,
-            max_resolution=self.max_resolution,
-            equal_resolution=equal_resolution,
-            numpify=numpify,
-            torchify=torchify,
-        )
-        # let's simply copy the frames to fake a long video-clip
-        if numpify or torchify:
-            videos = []
-            for image in images:
-                if numpify:
-                    video = image[None, ...].repeat(8, 0)
-                else:
-                    video = image[None, ...].repeat(8, 1, 1, 1)
-                videos.append(video)
-        else:
-            videos = []
-            for pil_image in images:
-                videos.append([pil_image] * 8)
-
-        return videos
-
-
-@require_torch
-@require_vision
-class VideoLlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
-    image_processing_class = VideoLlavaImageProcessor if is_vision_available() else None
-
-    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->VideoLlava
-    def setUp(self):
-        super().setUp()
-        self.image_processor_tester = VideoLlavaImageProcessingTester(self)
-
-    @property
-    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
-    def image_processor_dict(self):
-        return self.image_processor_tester.prepare_image_processor_dict()
-
-    def test_image_processor_properties(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        self.assertTrue(hasattr(image_processing, "do_resize"))
-        self.assertTrue(hasattr(image_processing, "size"))
-        self.assertTrue(hasattr(image_processing, "do_center_crop"))
-        self.assertTrue(hasattr(image_processing, "center_crop"))
-        self.assertTrue(hasattr(image_processing, "do_normalize"))
-        self.assertTrue(hasattr(image_processing, "image_mean"))
-        self.assertTrue(hasattr(image_processing, "image_std"))
-        self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
-
-    # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs
-    def test_image_processor_from_dict_with_kwargs(self):
-        for image_processing_class in self.image_processor_list:
-            image_processor = image_processing_class.from_dict(self.image_processor_dict)
-            self.assertEqual(image_processor.size, {"shortest_edge": 20})
-            self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
-
-            image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
-            self.assertEqual(image_processor.size, {"shortest_edge": 42})
-            self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
-
-    def test_call_pil(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random PIL images
-        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
-        for image in image_inputs:
-            self.assertIsInstance(image, Image.Image)
-
-        # Test not batched input
-        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values_images
-        expected_output_image_shape = (1, 3, 18, 18)
-        self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
-
-        # Test batched
-        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values_images
-        expected_output_image_shape = (5, 3, 18, 18)
-        self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
-
-    def test_call_numpy(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
-        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
-        for image in image_inputs:
-            self.assertIsInstance(image, np.ndarray)
-
-        # Test not batched input
-        encoded_images = image_processing(images=image_inputs[0], return_tensors="pt").pixel_values_images
-        expected_output_image_shape = (1, 3, 18, 18)
-        self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
-
-        # Test batched
-        encoded_images = image_processing(images=image_inputs, return_tensors="pt").pixel_values_images
-        expected_output_image_shape = (5, 3, 18, 18)
-        self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
-
-    def test_call_numpy_videos(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
-        video_inputs = self.image_processor_tester.prepare_video_inputs(numpify=True, equal_resolution=True)
-        for video in video_inputs:
-            self.assertIsInstance(video, np.ndarray)
-
-        # Test not batched input
-        encoded_videos = image_processing(images=None, videos=video_inputs[0], return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (1, 8, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-        # Test batched
-        encoded_videos = image_processing(images=None, videos=video_inputs, return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (5, 8, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-    def test_call_pil_videos(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # the inputs come in list of lists batched format
-        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True)
-        for video in video_inputs:
-            self.assertIsInstance(video[0], Image.Image)
-
-        # Test not batched input
-        encoded_videos = image_processing(images=None, videos=video_inputs[0], return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (1, 8, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-        # Test batched
-        encoded_videos = image_processing(images=None, videos=video_inputs, return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (5, 8, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-    def test_call_pytorch(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random PyTorch tensors
-        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
-
-        for image in image_inputs:
-            self.assertIsInstance(image, torch.Tensor)
-
-        # Test not batched input
-        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values_images
-        expected_output_image_shape = (1, 3, 18, 18)
-        self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
-
-        # Test batched
-        encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values_images
-        expected_output_image_shape = (5, 3, 18, 18)
-        self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
-
-    def test_call_pytorch_videos(self):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random PyTorch tensors
-        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, torchify=True)
-        for video in video_inputs:
-            self.assertIsInstance(video, torch.Tensor)
-
-        # Test not batched input
-        encoded_videos = image_processing(images=None, videos=video_inputs[0], return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (1, 8, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-        # Test batched
-        encoded_videos = image_processing(images=None, videos=video_inputs, return_tensors="pt").pixel_values_videos
-        expected_output_video_shape = (5, 8, 3, 18, 18)
-        self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
-
-    @parameterized.expand([(True, False), (False, True)])
-    def test_call_mixed(self, numpify, torchify):
-        # Initialize image_processing
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        # create random numpy tensors
-        image_inputs = self.image_processor_tester.prepare_image_inputs(
-            equal_resolution=True, numpify=numpify, torchify=torchify
-        )
-        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, torchify=torchify)
-
-        # Test not batched input
-        encoded = image_processing(images=image_inputs[0], videos=video_inputs[0], return_tensors="pt")
-        expected_output_video_shape = (1, 8, 3, 18, 18)
-        expected_output_image_shape = (1, 3, 18, 18)
-        self.assertEqual(tuple(encoded.pixel_values_videos.shape), expected_output_video_shape)
-        self.assertEqual(tuple(encoded.pixel_values_images.shape), expected_output_image_shape)
-
-        # Test batched
-        encoded = image_processing(images=image_inputs, videos=video_inputs, return_tensors="pt")
-        expected_output_video_shape = (5, 8, 3, 18, 18)
-        expected_output_image_shape = (5, 3, 18, 18)
-        self.assertEqual(tuple(encoded.pixel_values_videos.shape), expected_output_video_shape)
-        self.assertEqual(tuple(encoded.pixel_values_images.shape), expected_output_image_shape)
-
-    def test_call_numpy_4_channels(self):
-        # Test that can process images which have an arbitrary number of channels
-        # Initialize image_processing
-        image_processor = self.image_processing_class(**self.image_processor_dict)
-
-        # create random numpy tensors
-        self.image_processor_tester.num_channels = 4
-        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
-
-        # Test not batched input
-        encoded_images = image_processor(
-            image_inputs[0],
-            return_tensors="pt",
-            input_data_format="channels_last",
-            image_mean=0,
-            image_std=1,
-        ).pixel_values_images
-        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
-        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
-
-        # Test batched
-        encoded_images = image_processor(
-            image_inputs,
-            return_tensors="pt",
-            input_data_format="channels_last",
-            image_mean=0,
-            image_std=1,
-        ).pixel_values_images
-        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
-        self.assertEqual(
-            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
-        )
--- a/tests/models/video_llava/test_video_processing_video_llava.py
+++ b/tests/models/video_llava/test_video_processing_video_llava.py
@@ -0,0 +1,122 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
+
+from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
+
+
+if is_torch_available():
+    pass
+
+if is_vision_available():
+    if is_torchvision_available():
+        from transformers import VideoLlavaVideoProcessor
+
+
+class VideoLlavaVideoProcessingTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=5,
+        num_frames=8,
+        num_channels=3,
+        image_size=18,
+        min_resolution=30,
+        max_resolution=80,
+        do_resize=True,
+        size=None,
+        do_center_crop=True,
+        crop_size=None,
+        do_normalize=True,
+        image_mean=OPENAI_CLIP_MEAN,
+        image_std=OPENAI_CLIP_STD,
+        do_convert_rgb=True,
+    ):
+        super().__init__()
+        size = size if size is not None else {"shortest_edge": 20}
+        crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+
+    def prepare_video_processor_dict(self):
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_center_crop": self.do_center_crop,
+            "crop_size": self.crop_size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_convert_rgb": self.do_convert_rgb,
+        }
+
+    def expected_output_video_shape(self, images):
+        return self.num_frames, self.num_channels, self.crop_size["height"], self.crop_size["width"]
+
+    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
+        videos = prepare_video_inputs(
+            batch_size=self.batch_size,
+            num_frames=self.num_frames,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            return_tensors=return_tensors,
+        )
+
+        return videos
+
+
+@require_torch
+@require_vision
+class VideoLlavaVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
+    fast_video_processing_class = VideoLlavaVideoProcessor if is_torchvision_available() else None
+
+    def setUp(self):
+        super().setUp()
+        self.video_processor_tester = VideoLlavaVideoProcessingTester(self)
+
+    @property
+    def video_processor_dict(self):
+        return self.video_processor_tester.prepare_video_processor_dict()
+
+    def test_video_processor_properties(self):
+        video_processing = self.fast_video_processing_class(**self.video_processor_dict)
+        self.assertTrue(hasattr(video_processing, "do_resize"))
+        self.assertTrue(hasattr(video_processing, "size"))
+        self.assertTrue(hasattr(video_processing, "do_center_crop"))
+        self.assertTrue(hasattr(video_processing, "center_crop"))
+        self.assertTrue(hasattr(video_processing, "do_normalize"))
+        self.assertTrue(hasattr(video_processing, "image_mean"))
+        self.assertTrue(hasattr(video_processing, "image_std"))
+        self.assertTrue(hasattr(video_processing, "do_convert_rgb"))
--- a/tests/test_image_processing_common.py
+++ b/tests/test_image_processing_common.py
@@ -179,7 +179,7 @@ class ImageProcessingTestMixin:

        encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
        encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
-        self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1))
+        torch.testing.assert_close(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1, rtol=1e-3)
        self.assertLessEqual(
            torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 5e-3
        )
@@ -205,7 +205,7 @@ class ImageProcessingTestMixin:
        encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
        encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")

-        self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1))
+        torch.testing.assert_close(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1, rtol=1e-3)
        self.assertLessEqual(
            torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 5e-3
        )
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -539,7 +539,7 @@ class ProcessorTesterMixin:
        video_input = self.prepare_video_inputs()

        inputs = processor(text=input_str, videos=video_input, return_tensors="pt")
-        self.assertLessEqual(inputs[self.videos_input_name][0][0][0].mean(), 0)
+        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)

    def test_kwargs_overrides_default_tokenizer_kwargs_video(self):
        if "video_processor" not in self.processor_class.attributes:
@@ -574,7 +574,7 @@ class ProcessorTesterMixin:
        video_input = self.prepare_video_inputs()

        inputs = processor(text=input_str, videos=video_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
-        self.assertLessEqual(inputs[self.videos_input_name][0][0][0].mean(), 0)
+        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)

    def test_unstructured_kwargs_video(self):
        if "video_processor" not in self.processor_class.attributes:
@@ -596,7 +596,7 @@ class ProcessorTesterMixin:
            max_length=76,
        )

-        self.assertLessEqual(inputs[self.videos_input_name][0][0][0].mean(), 0)
+        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
        self.assertEqual(inputs[self.text_input_name].shape[-1], 76)

    def test_unstructured_kwargs_batched_video(self):
@@ -619,7 +619,7 @@ class ProcessorTesterMixin:
            max_length=76,
        )

-        self.assertLessEqual(inputs[self.videos_input_name][0][0][0].mean(), 0)
+        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
        self.assertTrue(
            len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1])
            and len(inputs[self.text_input_name][1]) < 76
@@ -665,7 +665,7 @@ class ProcessorTesterMixin:
        inputs = processor(text=input_str, videos=video_input, **all_kwargs)
        self.skip_processor_without_typed_kwargs(processor)

-        self.assertLessEqual(inputs[self.videos_input_name][0][0][0].mean(), 0)
+        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
        self.assertEqual(inputs[self.text_input_name].shape[-1], 76)

    def test_structured_kwargs_nested_from_dict_video(self):
@@ -686,7 +686,7 @@ class ProcessorTesterMixin:
        }

        inputs = processor(text=input_str, videos=video_input, **all_kwargs)
-        self.assertLessEqual(inputs[self.videos_input_name][0][0][0].mean(), 0)
+        self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
        self.assertEqual(inputs[self.text_input_name].shape[-1], 76)

    # TODO: the same test, but for audio + text processors that have strong overlap in kwargs
@@ -907,15 +907,15 @@ class ProcessorTesterMixin:
        for prompt in continue_prompt:
            self.assertTrue(prompt.endswith("It is the sound of"))  # no `eos` token at the end

-    @require_av
+    @require_librosa
    @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
    def test_apply_chat_template_audio(self, batch_size: int, return_tensors: str):
        self._test_apply_chat_template(
            "audio", batch_size, return_tensors, "audio_input_name", "feature_extracttor", MODALITY_INPUT_DATA["audio"]
        )

-    @require_librosa
-    @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
+    @require_av
+    @parameterized.expand([(1, "pt"), (2, "pt")])  # video processor suports only torchvision
    def test_apply_chat_template_video(self, batch_size: int, return_tensors: str):
        self._test_apply_chat_template(
            "video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"]
@@ -927,6 +927,7 @@ class ProcessorTesterMixin:
            "image", batch_size, return_tensors, "images_input_name", "image_processor", MODALITY_INPUT_DATA["images"]
        )

+    @require_torch
    def test_apply_chat_template_video_frame_sampling(self):
        processor = self.get_processor()

@@ -962,7 +963,7 @@ class ProcessorTesterMixin:
            tokenize=True,
            return_dict=True,
            num_frames=num_frames,
-            return_tensors="np",
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
@@ -976,7 +977,7 @@ class ProcessorTesterMixin:
            tokenize=True,
            return_dict=True,
            video_fps=video_fps,
-            return_tensors="np",
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
@@ -1024,6 +1025,7 @@ class ProcessorTesterMixin:
        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 2)

    @require_av
+    @require_torch
    def test_apply_chat_template_video_special_processing(self):
        """
        Tests that models can use their own preprocessing to preprocess conversations.
@@ -1081,7 +1083,7 @@ class ProcessorTesterMixin:
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
-            return_tensors="np",
+            return_tensors="pt",
        )
        self.assertTrue(self.videos_input_name in out_dict_with_video)

--- a/tests/test_video_processing_common.py
+++ b/tests/test_video_processing_common.py
@@ -0,0 +1,395 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import json
+import os
+import tempfile
+import warnings
+
+import numpy as np
+from packaging import version
+
+from transformers import AutoVideoProcessor
+from transformers.testing_utils import (
+    check_json_file_has_correct_format,
+    require_torch,
+    require_torch_gpu,
+    require_vision,
+    slow,
+    torch_device,
+)
+from transformers.utils import is_torch_available, is_vision_available
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    from PIL import Image
+
+
+def prepare_video(num_frames, num_channels, width=10, height=10, return_tensors="pil"):
+    """This function prepares a video as a list of PIL images/NumPy arrays/PyTorch tensors."""
+
+    video = []
+    for i in range(num_frames):
+        video.append(np.random.randint(255, size=(width, height, num_channels), dtype=np.uint8))
+
+    if return_tensors == "pil":
+        # PIL expects the channel dimension as last dimension
+        video = [Image.fromarray(frame) for frame in video]
+    elif return_tensors == "torch":
+        # Torch images are typically in channels first format
+        video = torch.tensor(video).permute(0, 3, 1, 2)
+    elif return_tensors == "np":
+        # Numpy images are typically in channels last format
+        video = np.array(video)
+
+    return video
+
+
+def prepare_video_inputs(
+    batch_size,
+    num_frames,
+    num_channels,
+    min_resolution,
+    max_resolution,
+    equal_resolution=False,
+    return_tensors="pil",
+):
+    """This function prepares a batch of videos: a list of list of PIL images, or a list of list of numpy arrays if
+    one specifies return_tensors="np", or a list of list of PyTorch tensors if one specifies return_tensors="torch".
+
+    One can specify whether the videos are of the same resolution or not.
+    """
+
+    video_inputs = []
+    for i in range(batch_size):
+        if equal_resolution:
+            width = height = max_resolution
+        else:
+            width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
+        video = prepare_video(
+            num_frames=num_frames,
+            num_channels=num_channels,
+            width=width,
+            height=height,
+            return_tensors=return_tensors,
+        )
+        video_inputs.append(video)
+
+    return video_inputs
+
+
+class VideoProcessingTestMixin:
+    test_cast_dtype = None
+    fast_video_processing_class = None
+    video_processor_list = None
+    input_name = "pixel_values_videos"
+
+    def setUp(self):
+        video_processor_list = []
+
+        if self.fast_video_processing_class:
+            video_processor_list.append(self.fast_video_processing_class)
+
+        self.video_processor_list = video_processor_list
+
+    def test_video_processor_to_json_string(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor = video_processing_class(**self.video_processor_dict)
+            obj = json.loads(video_processor.to_json_string())
+            for key, value in self.video_processor_dict.items():
+                self.assertEqual(obj[key], value)
+
+    def test_video_processor_to_json_file(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_first = video_processing_class(**self.video_processor_dict)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                json_file_path = os.path.join(tmpdirname, "video_processor.json")
+                video_processor_first.to_json_file(json_file_path)
+                video_processor_second = video_processing_class.from_json_file(json_file_path)
+
+            self.assertEqual(video_processor_second.to_dict(), video_processor_first.to_dict())
+
+    def test_video_processor_from_dict_with_kwargs(self):
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
+        self.assertEqual(video_processor.size, {"shortest_edge": 20})
+        self.assertEqual(video_processor.crop_size, {"height": 18, "width": 18})
+
+        video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42, crop_size=84)
+        self.assertEqual(video_processor.size, {"shortest_edge": 42})
+        self.assertEqual(video_processor.crop_size, {"height": 84, "width": 84})
+
+    def test_video_processor_from_and_save_pretrained(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_first = video_processing_class(**self.video_processor_dict)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                saved_file = video_processor_first.save_pretrained(tmpdirname)[0]
+                check_json_file_has_correct_format(saved_file)
+                video_processor_second = video_processing_class.from_pretrained(tmpdirname)
+
+            self.assertEqual(video_processor_second.to_dict(), video_processor_first.to_dict())
+
+    def test_video_processor_save_load_with_autovideoprocessor(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor_first = video_processing_class(**self.video_processor_dict)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                saved_file = video_processor_first.save_pretrained(tmpdirname)[0]
+                check_json_file_has_correct_format(saved_file)
+
+                use_fast = video_processing_class.__name__.endswith("Fast")
+                video_processor_second = AutoVideoProcessor.from_pretrained(tmpdirname, use_fast=use_fast)
+
+            self.assertEqual(video_processor_second.to_dict(), video_processor_first.to_dict())
+
+    def test_init_without_params(self):
+        for video_processing_class in self.video_processor_list:
+            video_processor = video_processing_class()
+            self.assertIsNotNone(video_processor)
+
+    @slow
+    @require_torch_gpu
+    @require_vision
+    def test_can_compile_fast_video_processor(self):
+        if self.fast_video_processing_class is None:
+            self.skipTest("Skipping compilation test as fast video processor is not defined")
+        if version.parse(torch.__version__) < version.parse("2.3"):
+            self.skipTest(reason="This test requires torch >= 2.3 to run.")
+
+        torch.compiler.reset()
+        video_inputs = self.video_processor_tester.prepare_video_inputs(equal_resolution=False, return_tensors="torch")
+        video_processor = self.fast_video_processing_class(**self.video_processor_dict)
+        output_eager = video_processor(video_inputs, device=torch_device, return_tensors="pt")
+
+        video_processor = torch.compile(video_processor, mode="reduce-overhead")
+        output_compiled = video_processor(video_inputs, device=torch_device, return_tensors="pt")
+
+        torch.testing.assert_close(
+            output_eager[self.input_name], output_compiled[self.input_name], rtol=1e-4, atol=1e-4
+        )
+
+    @require_torch
+    @require_vision
+    def test_cast_dtype_device(self):
+        for video_processing_class in self.video_processor_list:
+            if self.test_cast_dtype is not None:
+                # Initialize video_processor
+                video_processor = video_processing_class(**self.video_processor_dict)
+
+                # create random PyTorch tensors
+                video_inputs = self.video_processor_tester.prepare_video_inputs(
+                    equal_resolution=False, return_tensors="torch"
+                )
+
+                encoding = video_processor(video_inputs, return_tensors="pt")
+
+                self.assertEqual(encoding[self.input_name].device, torch.device("cpu"))
+                self.assertEqual(encoding[self.input_name].dtype, torch.float32)
+
+                encoding = video_processor(video_inputs, return_tensors="pt").to(torch.float16)
+                self.assertEqual(encoding[self.input_name].device, torch.device("cpu"))
+                self.assertEqual(encoding[self.input_name].dtype, torch.float16)
+
+                encoding = video_processor(video_inputs, return_tensors="pt").to("cpu", torch.bfloat16)
+                self.assertEqual(encoding[self.input_name].device, torch.device("cpu"))
+                self.assertEqual(encoding[self.input_name].dtype, torch.bfloat16)
+
+                with self.assertRaises(TypeError):
+                    _ = video_processor(video_inputs, return_tensors="pt").to(torch.bfloat16, "cpu")
+
+                # Try with text + video feature
+                encoding = video_processor(video_inputs, return_tensors="pt")
+                encoding.update({"input_ids": torch.LongTensor([[1, 2, 3], [4, 5, 6]])})
+                encoding = encoding.to(torch.float16)
+
+                self.assertEqual(encoding[self.input_name].device, torch.device("cpu"))
+                self.assertEqual(encoding[self.input_name].dtype, torch.float16)
+                self.assertEqual(encoding.input_ids.dtype, torch.long)
+
+    def test_call_pil(self):
+        for video_processing_class in self.video_processor_list:
+            # Initialize video_processing
+            video_processing = video_processing_class(**self.video_processor_dict)
+            video_inputs = self.video_processor_tester.prepare_video_inputs(equal_resolution=False)
+
+            # Each video is a list of PIL Images
+            for video in video_inputs:
+                self.assertIsInstance(video[0], Image.Image)
+
+            # Test not batched input
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
+            self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape))
+
+            # Test batched
+            encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
+            self.assertEqual(
+                tuple(encoded_videos.shape), (self.video_processor_tester.batch_size, *expected_output_video_shape)
+            )
+
+    def test_call_numpy(self):
+        for video_processing_class in self.video_processor_list:
+            # Initialize video_processing
+            video_processing = video_processing_class(**self.video_processor_dict)
+            # create random numpy tensors
+            video_inputs = self.video_processor_tester.prepare_video_inputs(
+                equal_resolution=False, return_tensors="np"
+            )
+            for video in video_inputs:
+                self.assertIsInstance(video, np.ndarray)
+
+            # Test not batched input
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
+            self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape))
+
+            # Test batched
+            encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
+            self.assertEqual(
+                tuple(encoded_videos.shape), (self.video_processor_tester.batch_size, *expected_output_video_shape)
+            )
+
+    def test_call_pytorch(self):
+        for video_processing_class in self.video_processor_list:
+            # Initialize video_processing
+            video_processing = video_processing_class(**self.video_processor_dict)
+            # create random PyTorch tensors
+            video_inputs = self.video_processor_tester.prepare_video_inputs(
+                equal_resolution=False, return_tensors="torch"
+            )
+
+            for video in video_inputs:
+                self.assertIsInstance(video, torch.Tensor)
+
+            # Test not batched input
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
+            self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape))
+
+            # Test batched
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
+            encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
+            self.assertEqual(
+                tuple(encoded_videos.shape),
+                (self.video_processor_tester.batch_size, *expected_output_video_shape),
+            )
+
+    def test_nested_input(self):
+        """Tests that the processor can work with nested list where each video is a list of arrays"""
+        for video_processing_class in self.video_processor_list:
+            video_processing = video_processing_class(**self.video_processor_dict)
+            video_inputs = self.video_processor_tester.prepare_video_inputs(
+                equal_resolution=False, return_tensors="np"
+            )
+
+            # Test not batched input
+            video_inputs = [list(video) for video in video_inputs]
+            encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
+            self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape))
+
+            # Test batched
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
+            encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
+            self.assertEqual(
+                tuple(encoded_videos.shape),
+                (self.video_processor_tester.batch_size, *expected_output_video_shape),
+            )
+
+    def test_call_numpy_4_channels(self):
+        for video_processing_class in self.video_processor_list:
+            # Test that can process videos which have an arbitrary number of channels
+            # Initialize video_processing
+            video_processor = video_processing_class(**self.video_processor_dict)
+
+            # create random numpy tensors
+            self.video_processor_tester.num_channels = 4
+            video_inputs = self.video_processor_tester.prepare_video_inputs(
+                equal_resolution=False, return_tensors="pil"
+            )
+
+            # Test not batched input
+            encoded_videos = video_processor(
+                video_inputs[0],
+                return_tensors="pt",
+                input_data_format="channels_last",
+                image_mean=0,
+                image_std=1,
+            )[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
+            if video_processor.do_convert_rgb:
+                expected_output_video_shape = list(expected_output_video_shape)
+                expected_output_video_shape[1] = 3
+            self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape))
+
+            # Test batched
+            encoded_videos = video_processor(
+                video_inputs,
+                return_tensors="pt",
+                input_data_format="channels_last",
+                image_mean=0,
+                image_std=1,
+            )[self.input_name]
+            expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
+            if video_processor.do_convert_rgb:
+                expected_output_video_shape = list(expected_output_video_shape)
+                expected_output_video_shape[1] = 3
+            self.assertEqual(
+                tuple(encoded_videos.shape), (self.video_processor_tester.batch_size, *expected_output_video_shape)
+            )
+
+    def test_video_processor_preprocess_arguments(self):
+        is_tested = False
+
+        for video_processing_class in self.video_processor_list:
+            video_processor = video_processing_class(**self.video_processor_dict)
+
+            # validation done by _valid_processor_keys attribute
+            if hasattr(video_processor, "_valid_processor_keys") and hasattr(video_processor, "preprocess"):
+                preprocess_parameter_names = inspect.getfullargspec(video_processor.preprocess).args
+                preprocess_parameter_names.remove("self")
+                preprocess_parameter_names.sort()
+                valid_processor_keys = video_processor._valid_processor_keys
+                valid_processor_keys.sort()
+                self.assertEqual(preprocess_parameter_names, valid_processor_keys)
+                is_tested = True
+
+            # validation done by @filter_out_non_signature_kwargs decorator
+            if hasattr(video_processor.preprocess, "_filter_out_non_signature_kwargs"):
+                if hasattr(self.video_processor_tester, "prepare_video_inputs"):
+                    inputs = self.video_processor_tester.prepare_video_inputs()
+                elif hasattr(self.video_processor_tester, "prepare_video_inputs"):
+                    inputs = self.video_processor_tester.prepare_video_inputs()
+                else:
+                    self.skipTest(reason="No valid input preparation method found")
+
+                with warnings.catch_warnings(record=True) as raised_warnings:
+                    warnings.simplefilter("always")
+                    video_processor(inputs, extra_argument=True)
+
+                messages = " ".join([str(w.message) for w in raised_warnings])
+                self.assertGreaterEqual(len(raised_warnings), 1)
+                self.assertIn("extra_argument", messages)
+                is_tested = True
+
+        if not is_tested:
+            self.skipTest(reason="No validation found for `preprocess` method")
--- a/tests/utils/test_image_utils.py
+++ b/tests/utils/test_image_utils.py
@@ -30,7 +30,6 @@ from transformers import is_torch_available, is_vision_available
 from transformers.image_utils import (
    ChannelDimension,
    get_channel_dimension_axis,
-    make_batched_videos,
    make_flat_list_of_images,
    make_list_of_images,
    make_nested_list_of_images,
@@ -396,133 +395,6 @@ class ImageFeatureExtractionTester(unittest.TestCase):
        self.assertEqual(len(images_list[0]), 4)
        self.assertTrue(np.array_equal(images_list[0][0], images[0][0]))

-    def test_make_batched_videos_pil(self):
-        # Test a single image is converted to a list of 1 video with 1 frame
-        pil_image = get_random_image(16, 32)
-        videos_list = make_batched_videos(pil_image)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertEqual(len(videos_list[0]), 1)
-        self.assertIsInstance(videos_list[0][0], PIL.Image.Image)
-
-        # Test a list of images is converted to a list of 1 video
-        images = [get_random_image(16, 32) for _ in range(4)]
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertEqual(len(videos_list), 1)
-        self.assertEqual(len(videos_list[0]), 4)
-        self.assertIsInstance(videos_list[0][0], PIL.Image.Image)
-
-        # Test a nested list of images is not modified
-        images = [[get_random_image(16, 32) for _ in range(2)] for _ in range(2)]
-        videos_list = make_nested_list_of_images(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertEqual(len(videos_list), 2)
-        self.assertEqual(len(videos_list[0]), 2)
-        self.assertIsInstance(videos_list[0][0], PIL.Image.Image)
-
-    def test_make_batched_videos_numpy(self):
-        # Test a single image is converted to a list of 1 video with 1 frame
-        images = np.random.randint(0, 256, (16, 32, 3))
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertEqual(len(videos_list), 1)
-        self.assertTrue(np.array_equal(videos_list[0][0], images))
-
-        # Test a 4d array of images is converted to a list of 1 video
-        images = np.random.randint(0, 256, (4, 16, 32, 3))
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertIsInstance(videos_list[0][0], np.ndarray)
-        self.assertEqual(len(videos_list), 1)
-        self.assertEqual(len(videos_list[0]), 4)
-        self.assertTrue(np.array_equal(videos_list[0][0], images[0]))
-
-        # Test a list of images is converted to a list of videos
-        images = [np.random.randint(0, 256, (16, 32, 3)) for _ in range(4)]
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertEqual(len(videos_list), 1)
-        self.assertEqual(len(videos_list[0]), 4)
-        self.assertTrue(np.array_equal(videos_list[0][0], images[0]))
-
-        # Test a nested list of images is left unchanged
-        images = [[np.random.randint(0, 256, (16, 32, 3)) for _ in range(2)] for _ in range(2)]
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertEqual(len(videos_list), 2)
-        self.assertEqual(len(videos_list[0]), 2)
-        self.assertTrue(np.array_equal(videos_list[0][0], images[0][0]))
-
-        # Test a list of 4d array images is converted to a list of videos
-        images = [np.random.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)]
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertIsInstance(videos_list[0][0], np.ndarray)
-        self.assertEqual(len(videos_list), 2)
-        self.assertEqual(len(videos_list[0]), 4)
-        self.assertTrue(np.array_equal(videos_list[0][0], images[0][0]))
-
-        # Test a batch of list of 4d array images is converted to a list of videos
-        images = [[np.random.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)] for _ in range(2)]
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertIsInstance(videos_list[0][0], np.ndarray)
-        self.assertEqual(len(videos_list), 2)
-        self.assertEqual(len(videos_list[0]), 8)
-        self.assertTrue(np.array_equal(videos_list[0][0], images[0][0][0]))
-
-    @require_torch
-    def test_make_batched_videos_torch(self):
-        # Test a single image is converted to a list of 1 video with 1 frame
-        images = torch.randint(0, 256, (16, 32, 3))
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertEqual(len(videos_list[0]), 1)
-        self.assertTrue(np.array_equal(videos_list[0][0], images))
-
-        # Test a 4d tensor of images is converted to a list of 1 video
-        images = torch.randint(0, 256, (4, 16, 32, 3))
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertIsInstance(videos_list[0][0], torch.Tensor)
-        self.assertEqual(len(videos_list), 1)
-        self.assertEqual(len(videos_list[0]), 4)
-        self.assertTrue(np.array_equal(videos_list[0][0], images[0]))
-
-        # Test a list of images is converted to a list of videos
-        images = [torch.randint(0, 256, (16, 32, 3)) for _ in range(4)]
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertEqual(len(videos_list), 1)
-        self.assertEqual(len(videos_list[0]), 4)
-        self.assertTrue(np.array_equal(videos_list[0][0], images[0]))
-
-        # Test a nested list of images is left unchanged
-        images = [[torch.randint(0, 256, (16, 32, 3)) for _ in range(2)] for _ in range(2)]
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertEqual(len(videos_list), 2)
-        self.assertEqual(len(videos_list[0]), 2)
-        self.assertTrue(np.array_equal(videos_list[0][0], images[0][0]))
-
-        # Test a list of 4d tensor images is converted to a list of videos
-        images = [torch.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)]
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertIsInstance(videos_list[0][0], torch.Tensor)
-        self.assertEqual(len(videos_list), 2)
-        self.assertEqual(len(videos_list[0]), 4)
-        self.assertTrue(np.array_equal(videos_list[0][0], images[0][0]))
-
-        # Test a batch of list of 4d tensor images is converted to a list of videos
-        images = [[torch.randint(0, 256, (4, 16, 32, 3)) for _ in range(2)] for _ in range(2)]
-        videos_list = make_batched_videos(images)
-        self.assertIsInstance(videos_list[0], list)
-        self.assertIsInstance(videos_list[0][0], torch.Tensor)
-        self.assertEqual(len(videos_list), 2)
-        self.assertEqual(len(videos_list[0]), 8)
-        self.assertTrue(np.array_equal(videos_list[0][0], images[0][0][0]))
-
    @require_torch
    def test_conversion_torch_to_array(self):
        feature_extractor = ImageFeatureExtractionMixin()
--- a/tests/utils/test_video_utils.py
+++ b/tests/utils/test_video_utils.py
@@ -0,0 +1,286 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from huggingface_hub import hf_hub_download
+
+from transformers import is_torch_available, is_vision_available
+from transformers.image_processing_utils import get_size_dict
+from transformers.image_utils import SizeDict
+from transformers.processing_utils import VideosKwargs
+from transformers.testing_utils import (
+    require_av,
+    require_cv2,
+    require_decord,
+    require_torch,
+    require_torchvision,
+    require_vision,
+)
+from transformers.video_utils import make_batched_videos
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    import PIL
+
+    from transformers import BaseVideoProcessor
+    from transformers.video_utils import VideoMetadata, load_video
+
+
+def get_random_video(height, width, return_torch=False):
+    random_frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
+    video = np.array(([random_frame] * 8))
+    if return_torch:
+        # move channel first
+        return torch.from_numpy(video).permute(0, 3, 1, 2)
+    return video
+
+
+@require_vision
+@require_torchvision
+class BaseVideoProcessorTester(unittest.TestCase):
+    """
+    Tests that the `transforms` can be applied to a 4-dim array directly, i.e. to a whole video.
+    """
+
+    def test_make_batched_videos_pil(self):
+        # Test a single image is converted to a list of 1 video with 1 frame
+        video = get_random_video(16, 32)
+        pil_image = PIL.Image.fromarray(video[0])
+        videos_list = make_batched_videos(pil_image)
+        self.assertIsInstance(videos_list, list)
+        self.assertIsInstance(videos_list[0], np.ndarray)
+        self.assertEqual(videos_list[0].shape, (1, 16, 32, 3))
+        self.assertTrue(np.array_equal(videos_list[0][0], np.array(pil_image)))
+
+        # Test a list of videos is converted to a list of 1 video
+        video = get_random_video(16, 32)
+        video = [PIL.Image.fromarray(frame) for frame in video]
+        videos_list = make_batched_videos(video)
+        self.assertIsInstance(videos_list, list)
+        self.assertIsInstance(videos_list[0], np.ndarray)
+        self.assertEqual(videos_list[0].shape, (8, 16, 32, 3))
+        self.assertTrue(np.array_equal(videos_list[0], video))
+
+        # Test a nested list of videos is not modified
+        video = get_random_video(16, 32)
+        video = [PIL.Image.fromarray(frame) for frame in video]
+        videos = [video, video]
+        videos_list = make_batched_videos(videos)
+        self.assertIsInstance(videos_list, list)
+        self.assertIsInstance(videos_list[0], np.ndarray)
+        self.assertEqual(videos_list[0].shape, (8, 16, 32, 3))
+        self.assertTrue(np.array_equal(videos_list[0], video))
+
+    def test_make_batched_videos_numpy(self):
+        # Test a single image is converted to a list of 1 video with 1 frame
+        video = get_random_video(16, 32)[0]
+        videos_list = make_batched_videos(video)
+        self.assertIsInstance(videos_list, list)
+        self.assertIsInstance(videos_list[0], np.ndarray)
+        self.assertEqual(videos_list[0].shape, (1, 16, 32, 3))
+        self.assertTrue(np.array_equal(videos_list[0][0], video))
+
+        # Test a 4d array of videos is converted to a a list of 1 video
+        video = get_random_video(16, 32)
+        videos_list = make_batched_videos(video)
+        self.assertIsInstance(videos_list, list)
+        self.assertIsInstance(videos_list[0], np.ndarray)
+        self.assertEqual(videos_list[0].shape, (8, 16, 32, 3))
+        self.assertTrue(np.array_equal(videos_list[0], video))
+
+        # Test a list of videos is converted to a list of videos
+        video = get_random_video(16, 32)
+        videos = [video, video]
+        videos_list = make_batched_videos(videos)
+        self.assertIsInstance(videos_list, list)
+        self.assertIsInstance(videos_list[0], np.ndarray)
+        self.assertEqual(videos_list[0].shape, (8, 16, 32, 3))
+        self.assertTrue(np.array_equal(videos_list[0], video))
+
+    @require_torch
+    def test_make_batched_videos_torch(self):
+        # Test a single image is converted to a list of 1 video with 1 frame
+        video = get_random_video(16, 32)[0]
+        torch_video = torch.from_numpy(video)
+        videos_list = make_batched_videos(torch_video)
+        self.assertIsInstance(videos_list, list)
+        self.assertIsInstance(videos_list[0], np.ndarray)
+        self.assertEqual(videos_list[0].shape, (1, 16, 32, 3))
+        self.assertTrue(np.array_equal(videos_list[0][0], video))
+
+        # Test a 4d array of videos is converted to a a list of 1 video
+        video = get_random_video(16, 32)
+        torch_video = torch.from_numpy(video)
+        videos_list = make_batched_videos(torch_video)
+        self.assertIsInstance(videos_list, list)
+        self.assertIsInstance(videos_list[0], torch.Tensor)
+        self.assertEqual(videos_list[0].shape, (8, 16, 32, 3))
+        self.assertTrue(np.array_equal(videos_list[0], video))
+
+        # Test a list of videos is converted to a list of videos
+        video = get_random_video(16, 32)
+        torch_video = torch.from_numpy(video)
+        videos = [torch_video, torch_video]
+        videos_list = make_batched_videos(videos)
+        self.assertIsInstance(videos_list, list)
+        self.assertIsInstance(videos_list[0], torch.Tensor)
+        self.assertEqual(videos_list[0].shape, (8, 16, 32, 3))
+        self.assertTrue(np.array_equal(videos_list[0], video))
+
+    def test_resize(self):
+        video_processor = BaseVideoProcessor(model_init_kwargs=VideosKwargs)
+        video = get_random_video(16, 32, return_torch=True)
+
+        # Size can be an int or a tuple of ints.
+        size_dict = SizeDict(**get_size_dict((8, 8), param_name="size"))
+        resized_video = video_processor.resize(video, size=size_dict)
+        self.assertIsInstance(resized_video, torch.Tensor)
+        self.assertEqual(resized_video.shape, (8, 3, 8, 8))
+
+    def test_normalize(self):
+        video_processor = BaseVideoProcessor(model_init_kwargs=VideosKwargs)
+        array = torch.randn(4, 3, 16, 32)
+        mean = [0.1, 0.5, 0.9]
+        std = [0.2, 0.4, 0.6]
+
+        # mean and std can be passed as lists or NumPy arrays.
+        expected = (array - torch.tensor(mean)[:, None, None]) / torch.tensor(std)[:, None, None]
+        normalized_array = video_processor.normalize(array, mean, std)
+        torch.testing.assert_close(normalized_array, expected)
+
+    def test_center_crop(self):
+        video_processor = BaseVideoProcessor(model_init_kwargs=VideosKwargs)
+        video = get_random_video(16, 32, return_torch=True)
+
+        # Test various crop sizes: bigger on all dimensions, on one of the dimensions only and on both dimensions.
+        crop_sizes = [8, (8, 64), 20, (32, 64)]
+        for size in crop_sizes:
+            size_dict = SizeDict(**get_size_dict(size, default_to_square=True, param_name="crop_size"))
+            cropped_video = video_processor.center_crop(video, size_dict)
+            self.assertIsInstance(cropped_video, torch.Tensor)
+
+            expected_size = (size, size) if isinstance(size, int) else size
+            self.assertEqual(cropped_video.shape, (8, 3, *expected_size))
+
+    def test_convert_to_rgb(self):
+        video_processor = BaseVideoProcessor(model_init_kwargs=VideosKwargs)
+        video = get_random_video(20, 20, return_torch=True)
+
+        rgb_video = video_processor.convert_to_rgb(video[:, :1])
+        self.assertEqual(rgb_video.shape, (8, 3, 20, 20))
+
+        rgb_video = video_processor.convert_to_rgb(torch.cat([video, video[:, :1]], dim=1))
+        self.assertEqual(rgb_video.shape, (8, 3, 20, 20))
+
+
+@require_vision
+@require_av
+class LoadVideoTester(unittest.TestCase):
+    def test_load_video_url(self):
+        video, _ = load_video(
+            "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4",
+        )
+        self.assertEqual(video.shape, (243, 360, 640, 3))  # 243 frames is the whole video, no sampling applied
+
+    def test_load_video_local(self):
+        video_file_path = hf_hub_download(
+            repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
+        )
+        video, _ = load_video(video_file_path)
+        self.assertEqual(video.shape, (243, 360, 640, 3))  # 243 frames is the whole video, no sampling applied
+
+    # FIXME: @raushan, yt-dlp downloading works for for some reason it cannot redirect to out buffer?
+    # @requires_yt_dlp
+    # def test_load_video_youtube(self):
+    #     video = load_video("https://www.youtube.com/watch?v=QC8iQqtG0hg")
+    #     self.assertEqual(video.shape, (243, 360, 640, 3)) # 243 frames is the whole video, no sampling applied
+
+    @require_decord
+    @require_torchvision
+    @require_cv2
+    def test_load_video_backend_url(self):
+        video, _ = load_video(
+            "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4",
+            backend="decord",
+        )
+        self.assertEqual(video.shape, (243, 360, 640, 3))
+
+        # Can't use certain backends with url
+        with self.assertRaises(ValueError):
+            video, _ = load_video(
+                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4",
+                backend="opencv",
+            )
+        with self.assertRaises(ValueError):
+            video, _ = load_video(
+                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4",
+                backend="torchvision",
+            )
+
+    @require_decord
+    @require_torchvision
+    @require_cv2
+    def test_load_video_backend_local(self):
+        video_file_path = hf_hub_download(
+            repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
+        )
+        video, metadata = load_video(video_file_path, backend="decord")
+        self.assertEqual(video.shape, (243, 360, 640, 3))
+        self.assertIsInstance(metadata, VideoMetadata)
+
+        video, metadata = load_video(video_file_path, backend="opencv")
+        self.assertEqual(video.shape, (243, 360, 640, 3))
+        self.assertIsInstance(metadata, VideoMetadata)
+
+        video, metadata = load_video(video_file_path, backend="torchvision")
+        self.assertEqual(video.shape, (243, 360, 640, 3))
+        self.assertIsInstance(metadata, VideoMetadata)
+
+    def test_load_video_num_frames(self):
+        video, _ = load_video(
+            "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4",
+            num_frames=16,
+        )
+        self.assertEqual(video.shape, (16, 360, 640, 3))
+
+        video, _ = load_video(
+            "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4",
+            num_frames=22,
+        )
+        self.assertEqual(video.shape, (22, 360, 640, 3))
+
+    def test_load_video_fps(self):
+        video, _ = load_video(
+            "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4", fps=1
+        )
+        self.assertEqual(video.shape, (9, 360, 640, 3))
+
+        video, _ = load_video(
+            "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4", fps=2
+        )
+        self.assertEqual(video.shape, (19, 360, 640, 3))
+
+        # `num_frames` is mutually exclusive with `video_fps`
+        with self.assertRaises(ValueError):
+            video, _ = load_video(
+                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4",
+                fps=1,
+                num_frames=10,
+            )