[video processors] support frame sampling within processors (#38105)

* apply updates smolVLM (still needs workaround for chat template)

* add other models

* dump qwen omni for now, come back later

* port qwen omni from their impl

* wait, all qwens sample videos in same way!

* clean up

* make smolvlm backwards compatible and fix padding

* fix some tests

* fix smolvlm tests

* more clean up and test fixing

* delete unused arg

* fix

* address comments

* style

* fix test
This commit is contained in:
Raushan Turganbay
2025-06-12 11:34:30 +02:00
committed by GitHub
parent 887054c714
commit 27459025b8
25 changed files with 864 additions and 795 deletions

View File

@@ -17,7 +17,6 @@ import shutil
import tempfile
import unittest
from huggingface_hub import hf_hub_download
from parameterized import parameterized
from transformers import AutoProcessor, AutoTokenizer, InternVLProcessor
@@ -180,77 +179,6 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
)
images_patches_index += inputs["pixel_values"].shape[0]
# Override video chat_template tests as InternVLProcessor returns flattened video features
@require_av
@require_torch
def test_apply_chat_template_video_special_processing(self):
"""
Tests that models can use their own preprocessing to preprocess conversations.
"""
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
signature = inspect.signature(processor.__call__)
if "videos" not in {*signature.parameters.keys()} or (
signature.parameters.get("videos") is not None
and signature.parameters["videos"].annotation == inspect._empty
):
self.skipTest("Processor doesn't accept videos at input")
video_file_path = hf_hub_download(
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
)
messages = [
[
{
"role": "user",
"content": [
{"type": "video", "path": video_file_path},
{"type": "text", "text": "What is shown in this video?"},
],
},
]
]
def _process_messages_for_chat_template(
conversation,
batch_images,
batch_videos,
batch_video_metadata,
**chat_template_kwargs,
):
# Let us just always return a dummy prompt
new_msg = [
[
{
"role": "user",
"content": [
{"type": "video"}, # no need to use path, video is loaded already by this moment
{"type": "text", "text": "Dummy prompt for preprocess testing"},
],
},
]
]
return new_msg
processor._process_messages_for_chat_template = _process_messages_for_chat_template
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
num_frames=8,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
# Difference with common tests, InternVLProcessor returns flattened video features, and uses 8 frames by default
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)
@require_torch
@require_av
def test_apply_chat_template_video_frame_sampling(self):
@@ -393,13 +321,13 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
tokenize=True,
return_dict=True,
return_tensors="pt",
num_frames=4, # by default no more than 4 frames, otherwise too slow
num_frames=2, # by default no more than 2 frames, otherwise too slow
)
self.assertTrue(self.videos_input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
video_len = 4 if batch_size == 1 else 3 # InternVL patches out and removes frames after processing
video_len = 2 if batch_size == 1 else 3 # InternVL patches out and removes frames after processing
self.assertEqual(len(out_dict[self.videos_input_name]), video_len)
for k in out_dict:
self.assertIsInstance(out_dict[k], torch.Tensor)

View File

@@ -407,14 +407,14 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
tokenize=True,
return_dict=True,
return_tensors=return_tensors,
num_frames=4, # by default no more than 4 frames, otherwise too slow
num_frames=2, # by default no more than 2 frames, otherwise too slow
)
input_name = getattr(self, input_name)
self.assertTrue(input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
video_len = 5760 if batch_size == 1 else 5808 # qwen pixels don't scale with bs same way as other models
video_len = 2880 if batch_size == 1 else 5808 # qwen pixels don't scale with bs same way as other models
mm_len = batch_size * 1564 if modality == "image" else video_len
self.assertEqual(len(out_dict[input_name]), mm_len)
@@ -525,72 +525,6 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2904)
@require_av
def test_apply_chat_template_video_special_processing(self):
"""
Tests that models can use their own preprocessing to preprocess conversations.
"""
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
signature = inspect.signature(processor.__call__)
if "videos" not in {*signature.parameters.keys()} or (
signature.parameters.get("videos") is not None
and signature.parameters["videos"].annotation == inspect._empty
):
self.skipTest("Processor doesn't accept videos at input")
video_file_path = hf_hub_download(
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
)
messages = [
[
{
"role": "user",
"content": [
{"type": "video", "path": video_file_path},
{"type": "text", "text": "What is shown in this video?"},
],
},
]
]
def _process_messages_for_chat_template(
conversation,
batch_images,
batch_videos,
batch_video_metadata,
**chat_template_kwargs,
):
# Let us just always return a dummy prompt
new_msg = [
[
{
"role": "user",
"content": [
{"type": "video"}, # no need to use path, video is loaded already by this moment
{"type": "text", "text": "Dummy prompt for preprocess testing"},
],
},
]
]
return new_msg
processor._process_messages_for_chat_template = _process_messages_for_chat_template
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 145912)
@require_librosa
@require_av
@unittest.skip(

View File

@@ -19,7 +19,6 @@ import unittest
import numpy as np
import pytest
from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, Qwen2Tokenizer
from transformers.testing_utils import require_av, require_torch, require_vision
@@ -219,14 +218,14 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
tokenize=True,
return_dict=True,
return_tensors=return_tensors,
num_frames=4, # by default no more than 4 frames, otherwise too slow
num_frames=2, # by default no more than 2 frames, otherwise too slow
)
input_name = getattr(self, input_name)
self.assertTrue(input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
video_len = 360 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
video_len = 180 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
mm_len = batch_size * 192 if modality == "image" else video_len
self.assertEqual(len(out_dict[input_name]), mm_len)
@@ -346,70 +345,3 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertEqual(inputs[self.images_input_name].shape[0], 612)
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(inputs[self.images_input_name].shape[0], 100)
@require_av
def test_apply_chat_template_video_special_processing(self):
"""
Tests that models can use their own preprocessing to preprocess conversations.
"""
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
signature = inspect.signature(processor.__call__)
if "videos" not in {*signature.parameters.keys()} or (
signature.parameters.get("videos") is not None
and signature.parameters["videos"].annotation == inspect._empty
):
self.skipTest("Processor doesn't accept videos at input")
video_file_path = hf_hub_download(
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
)
messages = [
[
{
"role": "user",
"content": [
{"type": "video", "path": video_file_path},
{"type": "text", "text": "What is shown in this video?"},
],
},
]
]
def _process_messages_for_chat_template(
conversation,
batch_images,
batch_videos,
batch_video_metadata,
**chat_template_kwargs,
):
# Let us just always return a dummy prompt
new_msg = [
[
{
"role": "user",
"content": [
{"type": "video"}, # no need to use path, video is loaded already by this moment
{"type": "text", "text": "Dummy prompt for preprocess testing"},
],
},
]
]
return new_msg
processor._process_messages_for_chat_template = _process_messages_for_chat_template
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)

View File

@@ -19,7 +19,6 @@ import unittest
import numpy as np
import pytest
from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, Qwen2Tokenizer
from transformers.testing_utils import require_av, require_torch, require_vision
@@ -220,14 +219,14 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
tokenize=True,
return_dict=True,
return_tensors=return_tensors,
num_frames=4, # by default no more than 4 frames, otherwise too slow
num_frames=2, # by default no more than 2 frames, otherwise too slow
)
input_name = getattr(self, input_name)
self.assertTrue(input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
video_len = 360 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
video_len = 180 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
mm_len = batch_size * 192 if modality == "image" else video_len
self.assertEqual(len(out_dict[input_name]), mm_len)
@@ -337,73 +336,6 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)
@require_av
def test_apply_chat_template_video_special_processing(self):
"""
Tests that models can use their own preprocessing to preprocess conversations.
"""
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
signature = inspect.signature(processor.__call__)
if "videos" not in {*signature.parameters.keys()} or (
signature.parameters.get("videos") is not None
and signature.parameters["videos"].annotation == inspect._empty
):
self.skipTest("Processor doesn't accept videos at input")
video_file_path = hf_hub_download(
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
)
messages = [
[
{
"role": "user",
"content": [
{"type": "video", "path": video_file_path},
{"type": "text", "text": "What is shown in this video?"},
],
},
]
]
def _process_messages_for_chat_template(
conversation,
batch_images,
batch_videos,
batch_video_metadata,
**chat_template_kwargs,
):
# Let us just always return a dummy prompt
new_msg = [
[
{
"role": "user",
"content": [
{"type": "video"}, # no need to use path, video is loaded already by this moment
{"type": "text", "text": "Dummy prompt for preprocess testing"},
],
},
]
]
return new_msg
processor._process_messages_for_chat_template = _process_messages_for_chat_template
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
def test_kwargs_overrides_custom_image_processor_kwargs(self):
processor = self.get_processor()
self.skip_processor_without_typed_kwargs(processor)

View File

@@ -99,8 +99,9 @@ class Qwen2VLVideoProcessingTester:
}
@require_vision
def expected_output_video_shape(self, videos):
grid_t = self.num_frames // self.temporal_patch_size
def expected_output_video_shape(self, videos, num_frames=None):
num_frames = num_frames if num_frames is not None else self.num_frames
grid_t = num_frames // self.temporal_patch_size
hidden_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
seq_len = 0
for video in videos:
@@ -289,3 +290,70 @@ class Qwen2VLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
def test_call_sample_frames(self):
    """
    Check how `do_sample_frames`, `num_frames` and `fps` affect the shape of the processed videos.

    For every class in `self.video_processor_list` the test verifies that:
      - with `do_sample_frames=False`, passing `num_frames` does not change the output shape;
      - with `do_sample_frames=True`, the output shape matches `num_frames` frames per video;
      - sampling by `fps` raises `ValueError` without `video_metadata`, and yields the shape expected
        for 6 frames per video with the metadata used below;
      - asking for more frames than the input video has raises `ValueError`.

    Single-video and batched calls are asserted separately in every error case, so the batched
    path is actually exercised.
    """
    for video_processing_class in self.video_processor_list:
        video_processing = video_processing_class(**self.video_processor_dict)
        tester = self.video_processor_tester

        # Temporarily switch the tester to 8 frames. The original value is restored in `finally`
        # so that a failing assertion cannot leave the tester in a modified state.
        prev_num_frames = tester.num_frames
        tester.num_frames = 8
        try:
            video_inputs = tester.prepare_video_inputs(
                equal_resolution=False,
                return_tensors="torch",
            )

            # Force set sampling to False. No sampling is expected even when `num_frames` exists
            video_processing.do_sample_frames = False

            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
            encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[
                self.input_name
            ]
            expected_output_video_shape = tester.expected_output_video_shape([video_inputs[0]])
            expected_output_video_shape_batched = tester.expected_output_video_shape(video_inputs)
            self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
            self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)

            # Set sampling to True. Video frames should be sampled with `num_frames` in the output
            video_processing.do_sample_frames = True

            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=4)[self.input_name]
            encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=4)[
                self.input_name
            ]
            expected_output_video_shape = tester.expected_output_video_shape([video_inputs[0]], num_frames=4)
            expected_output_video_shape_batched = tester.expected_output_video_shape(video_inputs, num_frames=4)
            self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
            self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)

            # Sample with `fps` requires metadata to infer number of frames from total duration.
            # One `assertRaises` per call: a shared context would exit on the first error and
            # never run the batched call.
            with self.assertRaises(ValueError):
                video_processing(video_inputs[0], return_tensors="pt", fps=3)
            with self.assertRaises(ValueError):
                video_processing(video_inputs, return_tensors="pt", fps=3)

            metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
            batched_metadata = metadata * len(video_inputs)
            encoded_videos = video_processing(
                video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata
            )[self.input_name]
            encoded_videos_batched = video_processing(
                video_inputs, return_tensors="pt", fps=3, video_metadata=batched_metadata
            )[self.input_name]
            expected_output_video_shape = tester.expected_output_video_shape([video_inputs[0]], num_frames=6)
            expected_output_video_shape_batched = tester.expected_output_video_shape(video_inputs, num_frames=6)
            self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
            self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)

            # We should raise error when asked to sample more frames than there are in input video
            with self.assertRaises(ValueError):
                video_processing(video_inputs[0], return_tensors="pt", num_frames=10)
            with self.assertRaises(ValueError):
                video_processing(video_inputs, return_tensors="pt", num_frames=10)
        finally:
            # Assign back the actual num frames in tester
            tester.num_frames = prev_num_frames

View File

@@ -16,6 +16,7 @@ import shutil
import tempfile
import unittest
from io import BytesIO
from typing import Optional
import numpy as np
import requests
@@ -63,7 +64,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
)
cls.bos_token = processor.tokenizer.bos_token
cls.image_token = processor.image_token
cls.video_token = processor.image_token * 8 # SmolVLM uses image token and repeats it `num_frames` times
cls.video_token = processor.video_token
cls.fake_image_token = processor.fake_image_token
cls.global_img_token = processor.global_image_token
@@ -93,6 +94,13 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
"chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
}
def prepare_video_inputs(self, batch_size: Optional[int] = None):
    """
    Build dummy video inputs for the processor tests.

    A single random `uint8` array of shape `(3, 30, 400)` is generated and repeated 8 times to form
    one video. The video is wrapped as `[[video]]`, and that outer list is repeated `batch_size`
    times when `batch_size` is given (once when it is `None`).
    """
    frame = np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)
    video_input = [frame] * 8
    repeats = 1 if batch_size is None else batch_size
    return [[video_input]] * repeats
def get_split_image_expected_tokens(self, processor, image_rows, image_cols):
text_split_images = []
for n_h in range(image_rows):
@@ -347,7 +355,6 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
{"type": "text", "text": "What do these images show?"},
{"type": "image"},
{"type": "image"},
"What do these images show?",
],
},
{
@@ -373,11 +380,8 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
)
self.assertEqual(rendered, expected_rendered)
@unittest.skip(reason="SmolVLM replaced `type=video` with `type=image` in chat templates")
def test_apply_chat_template_video_special_processing(self):
pass
@require_av
@require_torch
def test_apply_chat_template_video_frame_sampling(self):
# overridden because SmolVLM has special preprocessing for videos
processor = self.get_processor()
@@ -406,7 +410,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
tokenize=True,
return_dict=True,
num_frames=num_frames,
return_tensors="np",
return_tensors="pt",
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
@@ -421,7 +425,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
tokenize=True,
return_dict=True,
video_fps=video_fps,
return_tensors="np",
return_tensors="pt",
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
@@ -482,11 +486,11 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
do_rescale=True,
rescale_factor=-1,
padding="max_length",
max_length=76,
max_length=172,
)
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
self.assertEqual(len(inputs["input_ids"][0]), 76)
self.assertEqual(len(inputs["input_ids"][0]), 172)
@require_torch
@require_vision

View File

@@ -15,22 +15,16 @@
import unittest
import numpy as np
from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from transformers.utils import is_torchvision_available, is_vision_available
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
if is_torch_available():
import torch
if is_vision_available():
if is_torchvision_available():
from transformers import SmolVLMVideoProcessor
from transformers.models.smolvlm.video_processing_smolvlm import get_resize_output_image_size
class SmolVLMVideoProcessingTester:
@@ -58,6 +52,7 @@ class SmolVLMVideoProcessingTester:
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.max_image_size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
@@ -71,17 +66,16 @@ class SmolVLMVideoProcessingTester:
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
"max_image_size": self.max_image_size,
}
def expected_output_video_shape(self, videos):
max_height, max_width = 0, 0
if not isinstance(videos[0], torch.Tensor):
videos = [torch.tensor(np.array(video)).permute(0, -1, -3, -2) for video in videos]
for video in videos:
height, width = get_resize_output_image_size(video, self.size["longest_edge"])
max_height = max(height, max_height)
max_width = max(width, max_width)
return [self.num_frames, self.num_channels, max_height, max_width]
return [
self.num_frames,
self.num_channels,
self.max_image_size["longest_edge"],
self.max_image_size["longest_edge"],
]
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
videos = prepare_video_inputs(
@@ -116,3 +110,58 @@ class SmolVLMVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
self.assertEqual(video_processor.size, {"height": 42, "width": 42})
# overwrite, SmolVLM requires to have metadata no matter how we sample
def test_call_sample_frames(self):
    """
    Check how `do_sample_frames`, `num_frames` and `fps` affect the number of frames in the output.

    For every class in `self.video_processor_list` the test verifies that:
      - with `do_sample_frames=False`, passing `num_frames` leaves dimension 1 of the output at 8;
      - with `do_sample_frames=True`, sampling without `video_metadata` raises `ValueError`;
      - with metadata and `num_frames=6, fps=3`, dimension 1 of the output is 6;
      - the `fps=10, num_frames=20` call without metadata raises `ValueError`.

    Single-video and batched calls are asserted separately in every error case, so the batched
    path is actually exercised.
    """
    for video_processing_class in self.video_processor_list:
        video_processing = video_processing_class(**self.video_processor_dict)
        tester = self.video_processor_tester

        # Temporarily switch the tester to 8 frames. The original value is restored in `finally`
        # so that a failing assertion cannot leave the tester in a modified state.
        prev_num_frames = tester.num_frames
        tester.num_frames = 8
        try:
            video_inputs = tester.prepare_video_inputs(
                equal_resolution=False,
                return_tensors="torch",
            )

            # Force set sampling to False. No sampling is expected even when `num_frames` exists
            video_processing.do_sample_frames = False

            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
            encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[
                self.input_name
            ]
            self.assertEqual(encoded_videos.shape[1], 8)
            self.assertEqual(encoded_videos_batched.shape[1], 8)

            # Set sampling to True. Video frames should be sampled with `num_frames` in the output
            video_processing.do_sample_frames = True
            metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
            batched_metadata = metadata * len(video_inputs)

            # Sample with `fps` requires metadata to infer number of frames from total duration.
            # One `assertRaises` per call: a shared context would exit on the first error and
            # never run the batched call.
            with self.assertRaises(ValueError):
                video_processing(video_inputs[0], return_tensors="pt", num_frames=6, fps=3)
            with self.assertRaises(ValueError):
                video_processing(video_inputs, return_tensors="pt", num_frames=6, fps=3)

            encoded_videos = video_processing(
                video_inputs[0], return_tensors="pt", num_frames=6, fps=3, video_metadata=metadata
            )[self.input_name]
            encoded_videos_batched = video_processing(
                video_inputs, return_tensors="pt", num_frames=6, fps=3, video_metadata=batched_metadata
            )[self.input_name]
            self.assertEqual(encoded_videos.shape[1], 6)
            self.assertEqual(encoded_videos_batched.shape[1], 6)

            # We should raise error when asked to sample more frames than there are in input video
            with self.assertRaises(ValueError):
                video_processing(video_inputs[0], return_tensors="pt", fps=10, num_frames=20)
            with self.assertRaises(ValueError):
                video_processing(video_inputs, return_tensors="pt", fps=10, num_frames=20)
        finally:
            # Assign back the actual num frames in tester
            tester.num_frames = prev_num_frames

View File

@@ -507,7 +507,7 @@ class ProcessorTesterMixin:
if "video_processor" not in self.processor_class.attributes:
self.skipTest(f"video_processor attribute not present in {self.processor_class}")
processor_components = self.prepare_components()
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length")
processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(**processor_components, **processor_kwargs)
@@ -515,7 +515,7 @@ class ProcessorTesterMixin:
input_str = self.prepare_text_inputs(modality="video")
video_input = self.prepare_video_inputs()
inputs = processor(text=input_str, videos=video_input, return_tensors="pt")
self.assertEqual(inputs[self.text_input_name].shape[-1], 117)
self.assertEqual(inputs[self.text_input_name].shape[-1], 167)
def test_video_processor_defaults_preserved_by_video_kwargs(self):
"""
@@ -529,7 +529,7 @@ class ProcessorTesterMixin:
processor_components["video_processor"] = self.get_component(
"video_processor", do_rescale=True, rescale_factor=-1
)
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length")
processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(**processor_components, **processor_kwargs)
@@ -553,9 +553,9 @@ class ProcessorTesterMixin:
input_str = self.prepare_text_inputs(modality="video")
video_input = self.prepare_video_inputs()
inputs = processor(
text=input_str, videos=video_input, return_tensors="pt", max_length=112, padding="max_length"
text=input_str, videos=video_input, return_tensors="pt", max_length=162, padding="max_length"
)
self.assertEqual(inputs[self.text_input_name].shape[-1], 112)
self.assertEqual(inputs[self.text_input_name].shape[-1], 162)
def test_kwargs_overrides_default_video_processor_kwargs(self):
if "video_processor" not in self.processor_class.attributes:
@@ -564,7 +564,7 @@ class ProcessorTesterMixin:
processor_components["video_processor"] = self.get_component(
"video_processor", do_rescale=True, rescale_factor=1
)
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length")
processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(**processor_components, **processor_kwargs)
@@ -593,11 +593,11 @@ class ProcessorTesterMixin:
do_rescale=True,
rescale_factor=-1,
padding="max_length",
max_length=76,
max_length=176,
)
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
self.assertEqual(inputs[self.text_input_name].shape[-1], 176)
def test_unstructured_kwargs_batched_video(self):
if "video_processor" not in self.processor_class.attributes:
@@ -616,13 +616,13 @@ class ProcessorTesterMixin:
do_rescale=True,
rescale_factor=-1,
padding="longest",
max_length=76,
max_length=176,
)
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
self.assertTrue(
len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1])
and len(inputs[self.text_input_name][1]) < 76
and len(inputs[self.text_input_name][1]) < 176
)
def test_doubly_passed_kwargs_video(self):
@@ -659,14 +659,14 @@ class ProcessorTesterMixin:
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"videos_kwargs": {"do_rescale": True, "rescale_factor": -1},
"text_kwargs": {"padding": "max_length", "max_length": 76},
"text_kwargs": {"padding": "max_length", "max_length": 176},
}
inputs = processor(text=input_str, videos=video_input, **all_kwargs)
self.skip_processor_without_typed_kwargs(processor)
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
self.assertEqual(inputs[self.text_input_name].shape[-1], 176)
def test_structured_kwargs_nested_from_dict_video(self):
if "video_processor" not in self.processor_class.attributes:
@@ -682,12 +682,12 @@ class ProcessorTesterMixin:
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"videos_kwargs": {"do_rescale": True, "rescale_factor": -1},
"text_kwargs": {"padding": "max_length", "max_length": 76},
"text_kwargs": {"padding": "max_length", "max_length": 176},
}
inputs = processor(text=input_str, videos=video_input, **all_kwargs)
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
self.assertEqual(inputs[self.text_input_name].shape[-1], 176)
# TODO: the same test, but for audio + text processors that have strong overlap in kwargs
# TODO (molbap) use the same structure of attribute kwargs for other tests to avoid duplication
@@ -884,7 +884,7 @@ class ProcessorTesterMixin:
tokenize=True,
return_dict=True,
return_tensors=return_tensors,
num_frames=4, # by default no more than 4 frames, otherwise too slow
num_frames=2, # by default no more than 2 frames, otherwise too slow
)
input_name = getattr(self, input_name)
self.assertTrue(input_name in out_dict)
@@ -983,6 +983,21 @@ class ProcessorTesterMixin:
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), video_fps * 10)
# When `do_sample_frames=False` no sampling is done and whole video is loaded, even if number of frames is passed
video_fps = 1
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
do_sample_frames=False,
video_fps=video_fps,
return_tensors="pt",
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 300)
# Load with `video_fps` and `num_frames` args, should raise an error
with self.assertRaises(ValueError):
out_dict_with_video = processor.apply_chat_template(
@@ -1024,75 +1039,6 @@ class ProcessorTesterMixin:
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 2)
@require_av
@require_torch
def test_apply_chat_template_video_special_processing(self):
    """
    Tests that models can use their own preprocessing to preprocess conversations.

    The processor's `_process_messages_for_chat_template` hook is replaced with a stub that ignores
    its inputs and always returns a fixed dummy conversation. The test then checks that the dummy
    prompt ends up in the decoded `input_ids` and that exactly one video is returned.

    Skipped when the processor has no chat template, or when its `__call__` has no `videos`
    parameter (or has one without a type annotation).
    """
    processor = self.get_processor()
    if processor.chat_template is None:
        self.skipTest("Processor has no chat template")

    # A single lookup covers both skip conditions: parameter missing, or present but un-annotated.
    videos_param = inspect.signature(processor.__call__).parameters.get("videos")
    if videos_param is None or videos_param.annotation is inspect.Parameter.empty:
        self.skipTest("Processor doesn't accept videos at input")

    video_file_path = hf_hub_download(
        repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
    )
    messages = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "video", "path": video_file_path},
                    {"type": "text", "text": "What is shown in this video?"},
                ],
            },
        ]
    ]

    def _process_messages_for_chat_template(
        conversation,
        batch_images,
        batch_videos,
        batch_video_metadata,
        **chat_template_kwargs,
    ):
        # Let us just always return a dummy prompt
        new_msg = [
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "video"},  # no need to use path, video is loaded already by this moment
                        {"type": "text", "text": "Dummy prompt for preprocess testing"},
                    ],
                },
            ]
        ]
        return new_msg

    # Override the hook on this processor instance only
    processor._process_messages_for_chat_template = _process_messages_for_chat_template
    out_dict_with_video = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    self.assertIn(self.videos_input_name, out_dict_with_video)

    # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
    formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
    self.assertIn("Dummy prompt for preprocess testing", formatted_text)
    self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
    # No sampling arguments are passed above; 243 is presumably the total frame count of
    # `sample_demo_1.mp4` - TODO confirm against the dataset file.
    self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 243)
@require_librosa
@require_av
def test_chat_template_audio_from_video(self):

View File

@@ -293,6 +293,59 @@ class VideoProcessingTestMixin:
(self.video_processor_tester.batch_size, *expected_output_video_shape),
)
def test_call_sample_frames(self):
    """
    Tests frame sampling inside video processors (`do_sample_frames`, `num_frames`, `fps`).

    For every class in `self.video_processor_list` the test checks that:
      - with `do_sample_frames=False` the frame dimension stays at 8 even when `num_frames` is passed;
      - with `do_sample_frames=True` the output has exactly `num_frames` frames;
      - sampling by `fps` without `video_metadata` raises `ValueError`, and works once metadata is given;
      - requesting more frames than the input video has raises `ValueError`.

    Single-video and batched calls are verified independently. The tester's `num_frames` is
    temporarily set to 8 and is always restored, even when an assertion fails.
    """
    for video_processing_class in self.video_processor_list:
        video_processing = video_processing_class(**self.video_processor_dict)

        prev_num_frames = self.video_processor_tester.num_frames
        self.video_processor_tester.num_frames = 8
        try:
            video_inputs = self.video_processor_tester.prepare_video_inputs(
                equal_resolution=False,
                return_tensors="torch",
            )

            # Force set sampling to False. No sampling is expected even when `num_frames` exists
            video_processing.do_sample_frames = False
            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
            encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[
                self.input_name
            ]
            self.assertEqual(encoded_videos.shape[1], 8)
            self.assertEqual(encoded_videos_batched.shape[1], 8)

            # Set sampling to True. Video frames should be sampled with `num_frames` in the output
            video_processing.do_sample_frames = True
            encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
            encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[
                self.input_name
            ]
            self.assertEqual(encoded_videos.shape[1], 3)
            self.assertEqual(encoded_videos_batched.shape[1], 3)

            # Sample with `fps` requires metadata to infer number of frames from total duration.
            # Each call gets its own `assertRaises` block: inside a shared block the second call
            # would never run once the first one raises.
            with self.assertRaises(ValueError):
                video_processing(video_inputs[0], return_tensors="pt", fps=3)
            with self.assertRaises(ValueError):
                video_processing(video_inputs, return_tensors="pt", fps=3)

            # 2.0 seconds of video sampled at `fps=3` is expected to yield 6 frames
            metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
            batched_metadata = metadata * len(video_inputs)
            encoded_videos = video_processing(
                video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata
            )[self.input_name]
            encoded_videos_batched = video_processing(
                video_inputs, return_tensors="pt", fps=3, video_metadata=batched_metadata
            )[self.input_name]
            self.assertEqual(encoded_videos.shape[1], 6)
            self.assertEqual(encoded_videos_batched.shape[1], 6)

            # We should raise error when asked to sample more frames than there are in input video
            with self.assertRaises(ValueError):
                video_processing(video_inputs[0], return_tensors="pt", num_frames=10)
            with self.assertRaises(ValueError):
                video_processing(video_inputs, return_tensors="pt", num_frames=10)
        finally:
            # Assign back the actual num frames in tester, even if a check above failed, so the
            # temporary 8-frame setting is not left behind on the tester
            self.video_processor_tester.num_frames = prev_num_frames
def test_nested_input(self):
"""Tests that the processor can work with nested list where each video is a list of arrays"""
for video_processing_class in self.video_processor_list: