Add support for including in-memory videos (not just files/urls) in apply_chat_template (#39494)

* added code for handling video object ,as dictionary of frames and metadata, in chat template

* added new test where videos are passed as objects (dict of frames, metadata) in the chat template

* modified hardcoded video_len check that does not match with increased number of tests cases.

* Modify hardcoded video_len check that fails with increased number of tests

* update documentation of multi-modal chat templating with extra information about including video object in chat template.

* add array handling in load_video()

* temporary test video inlcuded

* skip testing smolvlm with videos that are list of frames

* update documentation & make fixup

* Address review comments
This commit is contained in:
Akib Jawad
2025-08-04 02:49:42 -07:00
committed by GitHub
parent 0d511f7a77
commit 2a9febd632
9 changed files with 106 additions and 16 deletions

View File

@@ -111,6 +111,7 @@ Some vision models also support video inputs. The message format is very similar
- The content `"type"` should be `"video"` to indicate the content is a video. - The content `"type"` should be `"video"` to indicate the content is a video.
- For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord). - For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).
- In addition to loading videos from a URL or file path, you can also pass decoded video data directly. This is useful if youve already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, decord, or torchvision). You don't need to save to files or store it in an URL.
> [!WARNING] > [!WARNING]
> Loading a video from `"url"` is only supported by the PyAV or Decord backends. > Loading a video from `"url"` is only supported by the PyAV or Decord backends.
@@ -137,6 +138,52 @@ messages = [
] ]
``` ```
### Example: Passing decoded video objects
```python
import numpy as np
video_object1 = np.random.randint(0, 255, size=(16, 224, 224, 3), dtype=np.uint8),
messages = [
{
"role": "system",
"content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
},
{
"role": "user",
"content": [
{"type": "video", "video": video_object1},
{"type": "text", "text": "What do you see in this video?"}
],
},
]
```
You can also use existing (`"load_video()"`) function to load a video, edit the video in memory and pass it in the messages.
```python
# Make sure a video backend library (pyav, decord, or torchvision) is available.
from transformers.video_utils import load_video
# load a video file in memory for testing
video_object2, _ = load_video(
"https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"
)
messages = [
{
"role": "system",
"content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
},
{
"role": "user",
"content": [
{"type": "video", "video": video_object2},
{"type": "text", "text": "What do you see in this video?"}
],
},
]
```
Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`] that controls the sampling process. Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`] that controls the sampling process.
The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html). The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html).

View File

@@ -31,6 +31,8 @@ import numpy as np
import typing_extensions import typing_extensions
from huggingface_hub.errors import EntryNotFoundError from huggingface_hub.errors import EntryNotFoundError
from transformers.utils import is_torch_available
from .audio_utils import load_audio from .audio_utils import load_audio
from .dynamic_module_utils import custom_object_save from .dynamic_module_utils import custom_object_save
from .feature_extraction_utils import BatchFeature from .feature_extraction_utils import BatchFeature
@@ -42,6 +44,7 @@ from .video_utils import VideoMetadata, load_video
if is_vision_available(): if is_vision_available():
from .image_utils import PILImageResampling from .image_utils import PILImageResampling
from .tokenization_utils_base import ( from .tokenization_utils_base import (
PaddingStrategy, PaddingStrategy,
PreTokenizedInput, PreTokenizedInput,
@@ -63,7 +66,6 @@ from .utils import (
download_url, download_url,
is_offline_mode, is_offline_mode,
is_remote_url, is_remote_url,
is_torch_available,
list_repo_templates, list_repo_templates,
logging, logging,
) )
@@ -1559,8 +1561,8 @@ class ProcessorMixin(PushToHubMixin):
for fname in video_fnames: for fname in video_fnames:
if isinstance(fname, (list, tuple)) and isinstance(fname[0], str): if isinstance(fname, (list, tuple)) and isinstance(fname[0], str):
# Case a: Video is provided as a list of image file names
video = [np.array(load_image(image_fname)) for image_fname in fname] video = [np.array(load_image(image_fname)) for image_fname in fname]
# create a 4D video because `load_video` always returns a 4D array
video = np.stack(video) video = np.stack(video)
metadata = None metadata = None
logger.warning( logger.warning(
@@ -1568,6 +1570,7 @@ class ProcessorMixin(PushToHubMixin):
"If your model requires metadata during processing, please load the whole video and let the processor sample frames instead." "If your model requires metadata during processing, please load the whole video and let the processor sample frames instead."
) )
else: else:
# Case b: Video is provided as a single file path or URL or decoded frames in a np.ndarray or torch.tensor
video, metadata = load_video( video, metadata = load_video(
fname, fname,
backend=mm_load_kwargs["video_load_backend"], backend=mm_load_kwargs["video_load_backend"],

View File

@@ -563,6 +563,14 @@ def load_video(
sample_indices_fn = sample_indices_fn_func sample_indices_fn = sample_indices_fn_func
if is_valid_image(video) or (isinstance(video, (list, tuple)) and is_valid_image(video[0])):
# Case 1: Video is provided as a 4D numpy array or torch tensor (frames, height, width, channels)
if not is_valid_video(video):
raise ValueError(
f"When passing video as decoded frames, video should be a 4D numpy array or torch tensor, but got {video.ndim} dimensions instead."
)
return video, None
if urlparse(video).netloc in ["www.youtube.com", "youtube.com"]: if urlparse(video).netloc in ["www.youtube.com", "youtube.com"]:
if not is_yt_dlp_available(): if not is_yt_dlp_available():
raise ImportError("To load a video from YouTube url you have to install `yt_dlp` first.") raise ImportError("To load a video from YouTube url you have to install `yt_dlp` first.")
@@ -579,8 +587,6 @@ def load_video(
file_obj = BytesIO(requests.get(video).content) file_obj = BytesIO(requests.get(video).content)
elif os.path.isfile(video): elif os.path.isfile(video):
file_obj = video file_obj = video
elif is_valid_image(video) or (isinstance(video, (list, tuple)) and is_valid_image(video[0])):
file_obj = None
else: else:
raise TypeError("Incorrect format used for video. Should be an url linking to an video or a local path.") raise TypeError("Incorrect format used for video. Should be an url linking to an video or a local path.")

View File

@@ -267,7 +267,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2)
@require_av @require_av
@parameterized.expand([(1, "pt"), (2, "pt")]) @parameterized.expand([(1, "pt"), (2, "pt"), (3, "pt")])
def test_apply_chat_template_video(self, batch_size: int, return_tensors: str): def test_apply_chat_template_video(self, batch_size: int, return_tensors: str):
processor = self.get_processor() processor = self.get_processor()
if processor.chat_template is None: if processor.chat_template is None:
@@ -340,7 +340,12 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertEqual(len(out_dict["input_ids"]), batch_size) self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size) self.assertEqual(len(out_dict["attention_mask"]), batch_size)
video_len = 2 if batch_size == 1 else 3 # InternVL patches out and removes frames after processing # InternVL internally collects frames from all the videos in a batch and flattens the batch dimension (B T C H W) -> (B*T C H W) then patches and removes the frames
# hence output length does not equal batch size
# removed hardcoded video length check video_len = 2 if batch_size == 1 else 3
# from experiment video_len looks like batch_size + 1
# TODO: update expected video_len calculation based on the internal processing logic of InternVLProcessor
video_len = batch_size + 1
self.assertEqual(len(out_dict[self.videos_input_name]), video_len) self.assertEqual(len(out_dict[self.videos_input_name]), video_len)
for k in out_dict: for k in out_dict:
self.assertIsInstance(out_dict[k], torch.Tensor) self.assertIsInstance(out_dict[k], torch.Tensor)

View File

@@ -422,8 +422,14 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertEqual(len(out_dict["input_ids"]), batch_size) self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size) self.assertEqual(len(out_dict["attention_mask"]), batch_size)
video_len = 2880 if batch_size == 1 else 5808 # qwen pixels don't scale with bs same way as other models if modality == "video":
mm_len = batch_size * 1564 if modality == "image" else video_len # qwen pixels don't scale with bs same way as other models, calculate expected video token count based on video_grid_thw
expected_video_token_count = 0
for thw in out_dict["video_grid_thw"]:
expected_video_token_count += thw[0] * thw[1] * thw[2]
mm_len = expected_video_token_count
else:
mm_len = batch_size * 1564
self.assertEqual(len(out_dict[input_name]), mm_len) self.assertEqual(len(out_dict[input_name]), mm_len)
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list} return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}

View File

@@ -239,8 +239,14 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertEqual(len(out_dict["input_ids"]), batch_size) self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size) self.assertEqual(len(out_dict["attention_mask"]), batch_size)
video_len = 180 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models if modality == "video":
mm_len = batch_size * 192 if modality == "image" else video_len # qwen pixels don't scale with bs same way as other models, calculate expected video token count based on video_grid_thw
expected_video_token_count = 0
for thw in out_dict["video_grid_thw"]:
expected_video_token_count += thw[0] * thw[1] * thw[2]
mm_len = expected_video_token_count
else:
mm_len = batch_size * 192
self.assertEqual(len(out_dict[input_name]), mm_len) self.assertEqual(len(out_dict[input_name]), mm_len)
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list} return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}

View File

@@ -239,9 +239,14 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertTrue(input_name in out_dict) self.assertTrue(input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size) self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size) self.assertEqual(len(out_dict["attention_mask"]), batch_size)
if modality == "video":
video_len = 180 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models # qwen pixels don't scale with bs same way as other models, calculate expected video token count based on video_grid_thw
mm_len = batch_size * 192 if modality == "image" else video_len expected_video_token_count = 0
for thw in out_dict["video_grid_thw"]:
expected_video_token_count += thw[0] * thw[1] * thw[2]
mm_len = expected_video_token_count
else:
mm_len = batch_size * 192
self.assertEqual(len(out_dict[input_name]), mm_len) self.assertEqual(len(out_dict[input_name]), mm_len)
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list} return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}

View File

@@ -596,3 +596,9 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
@unittest.skip("SmolVLM cannot accept image URL as video frames, because it needs to know video fps and duration") @unittest.skip("SmolVLM cannot accept image URL as video frames, because it needs to know video fps and duration")
def test_apply_chat_template_video_1(self): def test_apply_chat_template_video_1(self):
pass pass
@unittest.skip(
"SmolVLM cannot accept list of decoded video frames, because it needs to know video fps and duration"
)
def test_apply_chat_template_video_2(self):
pass

View File

@@ -33,7 +33,7 @@ from transformers.testing_utils import (
require_torch, require_torch,
require_vision, require_vision,
) )
from transformers.utils import is_torch_available, is_vision_available from transformers.utils import is_av_available, is_torch_available, is_vision_available
global_rng = random.Random() global_rng = random.Random()
@@ -44,7 +44,6 @@ if is_vision_available():
if is_torch_available(): if is_torch_available():
import torch import torch
MODALITY_INPUT_DATA = { MODALITY_INPUT_DATA = {
"images": [ "images": [
"http://images.cocodataset.org/val2017/000000039769.jpg", "http://images.cocodataset.org/val2017/000000039769.jpg",
@@ -60,6 +59,13 @@ MODALITY_INPUT_DATA = {
], ],
} }
if is_av_available():
from transformers.video_utils import load_video
# load a video file in memory for testing
video, _ = load_video("https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4")
MODALITY_INPUT_DATA["videos"].append(video)
def prepare_image_inputs(): def prepare_image_inputs():
"""This function prepares a list of PIL images""" """This function prepares a list of PIL images"""
@@ -931,7 +937,7 @@ class ProcessorTesterMixin:
) )
@require_av @require_av
@parameterized.expand([(1, "pt"), (2, "pt")]) # video processor supports only torchvision @parameterized.expand([(1, "pt"), (2, "pt"), (3, "pt")]) # video processor supports only torchvision
def test_apply_chat_template_video(self, batch_size: int, return_tensors: str): def test_apply_chat_template_video(self, batch_size: int, return_tensors: str):
self._test_apply_chat_template( self._test_apply_chat_template(
"video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"] "video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"]