Add support for including in-memory videos (not just files/urls) in apply_chat_template (#39494)

* added code for handling a video object, passed as a dictionary of frames and metadata, in the chat template

* added new test where videos are passed as objects (dict of frames, metadata) in the chat template

* modified the hardcoded video_len check, which no longer matches the increased number of test cases.

* Modify hardcoded video_len check that fails with increased number of tests

* update documentation of multi-modal chat templating with extra information about including video object in chat template.

* add array handling in load_video()

* temporary test video included

* skip testing smolvlm with videos that are list of frames

* update documentation & make fixup

* Address review comments
This commit is contained in:
Akib Jawad
2025-08-04 02:49:42 -07:00
committed by GitHub
parent 0d511f7a77
commit 2a9febd632
9 changed files with 106 additions and 16 deletions

View File

@@ -33,7 +33,7 @@ from transformers.testing_utils import (
require_torch,
require_vision,
)
from transformers.utils import is_torch_available, is_vision_available
from transformers.utils import is_av_available, is_torch_available, is_vision_available
global_rng = random.Random()
@@ -44,7 +44,6 @@ if is_vision_available():
if is_torch_available():
import torch
MODALITY_INPUT_DATA = {
"images": [
"http://images.cocodataset.org/val2017/000000039769.jpg",
@@ -60,6 +59,13 @@ MODALITY_INPUT_DATA = {
],
}
if is_av_available():
from transformers.video_utils import load_video
# load a video file in memory for testing
video, _ = load_video("https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4")
MODALITY_INPUT_DATA["videos"].append(video)
def prepare_image_inputs():
"""This function prepares a list of PIL images"""
@@ -931,7 +937,7 @@ class ProcessorTesterMixin:
)
@require_av
@parameterized.expand([(1, "pt"), (2, "pt")]) # video processor supports only torchvision
@parameterized.expand([(1, "pt"), (2, "pt"), (3, "pt")]) # video processor supports only torchvision
def test_apply_chat_template_video(self, batch_size: int, return_tensors: str):
self._test_apply_chat_template(
"video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"]