[video processor] support torchcodec and decrease cuda memory usage (#38880)

* don't move the whole video to GPU * add torchcodec * add tests * make style * instructblip as well * consistency * Update src/transformers/utils/import_utils.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * Update src/transformers/utils/import_utils.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * Update src/transformers/video_utils.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> --------- Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
2025-06-25 10:23:37 +02:00
parent 11d0feacce
commit e212ff9e6a
10 changed files with 129 additions and 9 deletions
--- a/tests/utils/test_video_utils.py
+++ b/tests/utils/test_video_utils.py
@@ -27,6 +27,7 @@ from transformers.testing_utils import (
    require_cv2,
    require_decord,
    require_torch,
+    require_torchcodec,
    require_torchvision,
    require_vision,
 )
@@ -261,6 +262,7 @@ class LoadVideoTester(unittest.TestCase):

    @require_decord
    @require_torchvision
+    @require_torchcodec
    @require_cv2
    def test_load_video_backend_url(self):
        video, _ = load_video(
@@ -269,6 +271,12 @@ class LoadVideoTester(unittest.TestCase):
        )
        self.assertEqual(video.shape, (243, 360, 640, 3))

+        video, _ = load_video(
+            "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4",
+            backend="torchcodec",
+        )
+        self.assertEqual(video.shape, (243, 360, 640, 3))
+
        # Can't use certain backends with url
        with self.assertRaises(ValueError):
            video, _ = load_video(
@@ -283,6 +291,7 @@ class LoadVideoTester(unittest.TestCase):

    @require_decord
    @require_torchvision
+    @require_torchcodec
    @require_cv2
    def test_load_video_backend_local(self):
        video_file_path = hf_hub_download(
@@ -300,6 +309,10 @@ class LoadVideoTester(unittest.TestCase):
        self.assertEqual(video.shape, (243, 360, 640, 3))
        self.assertIsInstance(metadata, VideoMetadata)

+        video, metadata = load_video(video_file_path, backend="torchcodec")
+        self.assertEqual(video.shape, (243, 360, 640, 3))
+        self.assertIsInstance(metadata, VideoMetadata)
+
    def test_load_video_num_frames(self):
        video, _ = load_video(
            "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4",