[video processors] support frame sampling within processors (#38105)
* apply updates smolVLM (still needs workaround for chat template) * add other models * dump qwen omni for now, come back later * port qwen omni from their impl * wait, all qwens sample videos in same way! * clean up * make smolvlm backwards compatible and fix padding * fix some tests * fix smolvlm tests * more clean up and test fixing * delete unused arg * fix * address comments * style * fix test
This commit is contained in:
committed by
GitHub
parent
887054c714
commit
27459025b8
@@ -17,7 +17,6 @@ import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import AutoProcessor, AutoTokenizer, InternVLProcessor
|
||||
@@ -180,77 +179,6 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
)
|
||||
images_patches_index += inputs["pixel_values"].shape[0]
|
||||
|
||||
# Override video chat_template tests as InternVLProcessor returns flattened video features
|
||||
@require_av
|
||||
@require_torch
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
"""
|
||||
Tests that models can use their own preprocessing to preprocess conversations.
|
||||
"""
|
||||
processor = self.get_processor()
|
||||
if processor.chat_template is None:
|
||||
self.skipTest("Processor has no chat template")
|
||||
|
||||
signature = inspect.signature(processor.__call__)
|
||||
if "videos" not in {*signature.parameters.keys()} or (
|
||||
signature.parameters.get("videos") is not None
|
||||
and signature.parameters["videos"].annotation == inspect._empty
|
||||
):
|
||||
self.skipTest("Processor doesn't accept videos at input")
|
||||
|
||||
video_file_path = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
|
||||
)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "path": video_file_path},
|
||||
{"type": "text", "text": "What is shown in this video?"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
|
||||
def _process_messages_for_chat_template(
|
||||
conversation,
|
||||
batch_images,
|
||||
batch_videos,
|
||||
batch_video_metadata,
|
||||
**chat_template_kwargs,
|
||||
):
|
||||
# Let us just always return a dummy prompt
|
||||
new_msg = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video"}, # no need to use path, video is loaded already by this moment
|
||||
{"type": "text", "text": "Dummy prompt for preprocess testing"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
return new_msg
|
||||
|
||||
processor._process_messages_for_chat_template = _process_messages_for_chat_template
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
num_frames=8,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
|
||||
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
|
||||
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
|
||||
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
|
||||
# Difference with common tests, InternVLProcessor returns flattened video features, and uses 8 frames by default
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)
|
||||
|
||||
@require_torch
|
||||
@require_av
|
||||
def test_apply_chat_template_video_frame_sampling(self):
|
||||
@@ -393,13 +321,13 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
num_frames=4, # by default no more than 4 frames, otherwise too slow
|
||||
num_frames=2, # by default no more than 2 frames, otherwise too slow
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
|
||||
video_len = 4 if batch_size == 1 else 3 # InternVL patches out and removes frames after processing
|
||||
video_len = 2 if batch_size == 1 else 3 # InternVL patches out and removes frames after processing
|
||||
self.assertEqual(len(out_dict[self.videos_input_name]), video_len)
|
||||
for k in out_dict:
|
||||
self.assertIsInstance(out_dict[k], torch.Tensor)
|
||||
|
||||
@@ -407,14 +407,14 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors=return_tensors,
|
||||
num_frames=4, # by default no more than 4 frames, otherwise too slow
|
||||
num_frames=2, # by default no more than 2 frames, otherwise too slow
|
||||
)
|
||||
input_name = getattr(self, input_name)
|
||||
self.assertTrue(input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
|
||||
video_len = 5760 if batch_size == 1 else 5808 # qwen pixels don't scale with bs same way as other models
|
||||
video_len = 2880 if batch_size == 1 else 5808 # qwen pixels don't scale with bs same way as other models
|
||||
mm_len = batch_size * 1564 if modality == "image" else video_len
|
||||
self.assertEqual(len(out_dict[input_name]), mm_len)
|
||||
|
||||
@@ -525,72 +525,6 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2904)
|
||||
|
||||
@require_av
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
"""
|
||||
Tests that models can use their own preprocessing to preprocess conversations.
|
||||
"""
|
||||
processor = self.get_processor()
|
||||
if processor.chat_template is None:
|
||||
self.skipTest("Processor has no chat template")
|
||||
|
||||
signature = inspect.signature(processor.__call__)
|
||||
if "videos" not in {*signature.parameters.keys()} or (
|
||||
signature.parameters.get("videos") is not None
|
||||
and signature.parameters["videos"].annotation == inspect._empty
|
||||
):
|
||||
self.skipTest("Processor doesn't accept videos at input")
|
||||
|
||||
video_file_path = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
|
||||
)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "path": video_file_path},
|
||||
{"type": "text", "text": "What is shown in this video?"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
|
||||
def _process_messages_for_chat_template(
|
||||
conversation,
|
||||
batch_images,
|
||||
batch_videos,
|
||||
batch_video_metadata,
|
||||
**chat_template_kwargs,
|
||||
):
|
||||
# Let us just always return a dummy prompt
|
||||
new_msg = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video"}, # no need to use path, video is loaded already by this moment
|
||||
{"type": "text", "text": "Dummy prompt for preprocess testing"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
return new_msg
|
||||
|
||||
processor._process_messages_for_chat_template = _process_messages_for_chat_template
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
|
||||
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
|
||||
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
|
||||
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 145912)
|
||||
|
||||
@require_librosa
|
||||
@require_av
|
||||
@unittest.skip(
|
||||
|
||||
@@ -19,7 +19,6 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from transformers import AutoProcessor, Qwen2Tokenizer
|
||||
from transformers.testing_utils import require_av, require_torch, require_vision
|
||||
@@ -219,14 +218,14 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors=return_tensors,
|
||||
num_frames=4, # by default no more than 4 frames, otherwise too slow
|
||||
num_frames=2, # by default no more than 2 frames, otherwise too slow
|
||||
)
|
||||
input_name = getattr(self, input_name)
|
||||
self.assertTrue(input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
|
||||
video_len = 360 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
|
||||
video_len = 180 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
|
||||
mm_len = batch_size * 192 if modality == "image" else video_len
|
||||
self.assertEqual(len(out_dict[input_name]), mm_len)
|
||||
|
||||
@@ -346,70 +345,3 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertEqual(inputs[self.images_input_name].shape[0], 612)
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||
self.assertEqual(inputs[self.images_input_name].shape[0], 100)
|
||||
|
||||
@require_av
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
"""
|
||||
Tests that models can use their own preprocessing to preprocess conversations.
|
||||
"""
|
||||
processor = self.get_processor()
|
||||
if processor.chat_template is None:
|
||||
self.skipTest("Processor has no chat template")
|
||||
|
||||
signature = inspect.signature(processor.__call__)
|
||||
if "videos" not in {*signature.parameters.keys()} or (
|
||||
signature.parameters.get("videos") is not None
|
||||
and signature.parameters["videos"].annotation == inspect._empty
|
||||
):
|
||||
self.skipTest("Processor doesn't accept videos at input")
|
||||
|
||||
video_file_path = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
|
||||
)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "path": video_file_path},
|
||||
{"type": "text", "text": "What is shown in this video?"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
|
||||
def _process_messages_for_chat_template(
|
||||
conversation,
|
||||
batch_images,
|
||||
batch_videos,
|
||||
batch_video_metadata,
|
||||
**chat_template_kwargs,
|
||||
):
|
||||
# Let us just always return a dummy prompt
|
||||
new_msg = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video"}, # no need to use path, video is loaded already by this moment
|
||||
{"type": "text", "text": "Dummy prompt for preprocess testing"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
return new_msg
|
||||
|
||||
processor._process_messages_for_chat_template = _process_messages_for_chat_template
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
|
||||
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
|
||||
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
|
||||
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
|
||||
|
||||
@@ -19,7 +19,6 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from transformers import AutoProcessor, Qwen2Tokenizer
|
||||
from transformers.testing_utils import require_av, require_torch, require_vision
|
||||
@@ -220,14 +219,14 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors=return_tensors,
|
||||
num_frames=4, # by default no more than 4 frames, otherwise too slow
|
||||
num_frames=2, # by default no more than 2 frames, otherwise too slow
|
||||
)
|
||||
input_name = getattr(self, input_name)
|
||||
self.assertTrue(input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
|
||||
video_len = 360 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
|
||||
video_len = 180 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
|
||||
mm_len = batch_size * 192 if modality == "image" else video_len
|
||||
self.assertEqual(len(out_dict[input_name]), mm_len)
|
||||
|
||||
@@ -337,73 +336,6 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)
|
||||
|
||||
@require_av
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
"""
|
||||
Tests that models can use their own preprocessing to preprocess conversations.
|
||||
"""
|
||||
processor = self.get_processor()
|
||||
if processor.chat_template is None:
|
||||
self.skipTest("Processor has no chat template")
|
||||
|
||||
signature = inspect.signature(processor.__call__)
|
||||
if "videos" not in {*signature.parameters.keys()} or (
|
||||
signature.parameters.get("videos") is not None
|
||||
and signature.parameters["videos"].annotation == inspect._empty
|
||||
):
|
||||
self.skipTest("Processor doesn't accept videos at input")
|
||||
|
||||
video_file_path = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
|
||||
)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "path": video_file_path},
|
||||
{"type": "text", "text": "What is shown in this video?"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
|
||||
def _process_messages_for_chat_template(
|
||||
conversation,
|
||||
batch_images,
|
||||
batch_videos,
|
||||
batch_video_metadata,
|
||||
**chat_template_kwargs,
|
||||
):
|
||||
# Let us just always return a dummy prompt
|
||||
new_msg = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video"}, # no need to use path, video is loaded already by this moment
|
||||
{"type": "text", "text": "Dummy prompt for preprocess testing"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
return new_msg
|
||||
|
||||
processor._process_messages_for_chat_template = _process_messages_for_chat_template
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
|
||||
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
|
||||
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
|
||||
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
|
||||
|
||||
def test_kwargs_overrides_custom_image_processor_kwargs(self):
|
||||
processor = self.get_processor()
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
@@ -99,8 +99,9 @@ class Qwen2VLVideoProcessingTester:
|
||||
}
|
||||
|
||||
@require_vision
|
||||
def expected_output_video_shape(self, videos):
|
||||
grid_t = self.num_frames // self.temporal_patch_size
|
||||
def expected_output_video_shape(self, videos, num_frames=None):
|
||||
num_frames = num_frames if num_frames is not None else self.num_frames
|
||||
grid_t = num_frames // self.temporal_patch_size
|
||||
hidden_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
|
||||
seq_len = 0
|
||||
for video in videos:
|
||||
@@ -289,3 +290,70 @@ class Qwen2VLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
)[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
|
||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
def test_call_sample_frames(self):
|
||||
for video_processing_class in self.video_processor_list:
|
||||
video_processing = video_processing_class(**self.video_processor_dict)
|
||||
|
||||
prev_num_frames = self.video_processor_tester.num_frames
|
||||
self.video_processor_tester.num_frames = 8
|
||||
video_inputs = self.video_processor_tester.prepare_video_inputs(
|
||||
equal_resolution=False,
|
||||
return_tensors="torch",
|
||||
)
|
||||
|
||||
# Force set sampling to False. No sampling is expected even when `num_frames` exists
|
||||
video_processing.do_sample_frames = False
|
||||
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
|
||||
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(video_inputs)
|
||||
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
|
||||
|
||||
# Set sampling to True. Video frames should be sampled with `num_frames` in the output
|
||||
video_processing.do_sample_frames = True
|
||||
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=4)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=4)[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(
|
||||
[video_inputs[0]], num_frames=4
|
||||
)
|
||||
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(
|
||||
video_inputs, num_frames=4
|
||||
)
|
||||
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
|
||||
|
||||
# Sample with `fps` requires metadata to infer number of frames from total duration
|
||||
with self.assertRaises(ValueError):
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", fps=3)[self.input_name]
|
||||
|
||||
metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
|
||||
batched_metadata = metadata * len(video_inputs)
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata)[
|
||||
self.input_name
|
||||
]
|
||||
encoded_videos_batched = video_processing(
|
||||
video_inputs, return_tensors="pt", fps=3, video_metadata=batched_metadata
|
||||
)[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(
|
||||
[video_inputs[0]], num_frames=6
|
||||
)
|
||||
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(
|
||||
video_inputs, num_frames=6
|
||||
)
|
||||
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
|
||||
|
||||
# We should raise error when asked to sample more frames than there are in input video
|
||||
with self.assertRaises(ValueError):
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=10)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=10)[
|
||||
self.input_name
|
||||
]
|
||||
|
||||
# Assign back the actual num frames in tester
|
||||
self.video_processor_tester.num_frames = prev_num_frames
|
||||
|
||||
@@ -16,6 +16,7 @@ import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from io import BytesIO
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
@@ -63,7 +64,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
)
|
||||
cls.bos_token = processor.tokenizer.bos_token
|
||||
cls.image_token = processor.image_token
|
||||
cls.video_token = processor.image_token * 8 # SmolVLM uses image token and repeats it `num_frames` times
|
||||
cls.video_token = processor.video_token
|
||||
cls.fake_image_token = processor.fake_image_token
|
||||
cls.global_img_token = processor.global_image_token
|
||||
|
||||
@@ -93,6 +94,13 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
"chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
|
||||
}
|
||||
|
||||
def prepare_video_inputs(self, batch_size: Optional[int] = None):
|
||||
"""This function prepares a list of numpy videos."""
|
||||
video_input = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] * 8
|
||||
if batch_size is None:
|
||||
return [[video_input]]
|
||||
return [[video_input]] * batch_size
|
||||
|
||||
def get_split_image_expected_tokens(self, processor, image_rows, image_cols):
|
||||
text_split_images = []
|
||||
for n_h in range(image_rows):
|
||||
@@ -347,7 +355,6 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
{"type": "text", "text": "What do these images show?"},
|
||||
{"type": "image"},
|
||||
{"type": "image"},
|
||||
"What do these images show?",
|
||||
],
|
||||
},
|
||||
{
|
||||
@@ -373,11 +380,8 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(rendered, expected_rendered)
|
||||
|
||||
@unittest.skip(reason="SmolVLM replaced `type=video` with `type=image` in chat templates")
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
pass
|
||||
|
||||
@require_av
|
||||
@require_torch
|
||||
def test_apply_chat_template_video_frame_sampling(self):
|
||||
# overridden because SmolVLM has special preprocessing for videos
|
||||
processor = self.get_processor()
|
||||
@@ -406,7 +410,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
num_frames=num_frames,
|
||||
return_tensors="np",
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
@@ -421,7 +425,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
video_fps=video_fps,
|
||||
return_tensors="np",
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
@@ -482,11 +486,11 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
do_rescale=True,
|
||||
rescale_factor=-1,
|
||||
padding="max_length",
|
||||
max_length=76,
|
||||
max_length=172,
|
||||
)
|
||||
|
||||
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 172)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
|
||||
@@ -15,22 +15,16 @@
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
from transformers.utils import is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_vision_available():
|
||||
if is_torchvision_available():
|
||||
from transformers import SmolVLMVideoProcessor
|
||||
from transformers.models.smolvlm.video_processing_smolvlm import get_resize_output_image_size
|
||||
|
||||
|
||||
class SmolVLMVideoProcessingTester:
|
||||
@@ -58,6 +52,7 @@ class SmolVLMVideoProcessingTester:
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.max_image_size = size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
@@ -71,17 +66,16 @@ class SmolVLMVideoProcessingTester:
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
"max_image_size": self.max_image_size,
|
||||
}
|
||||
|
||||
def expected_output_video_shape(self, videos):
|
||||
max_height, max_width = 0, 0
|
||||
if not isinstance(videos[0], torch.Tensor):
|
||||
videos = [torch.tensor(np.array(video)).permute(0, -1, -3, -2) for video in videos]
|
||||
for video in videos:
|
||||
height, width = get_resize_output_image_size(video, self.size["longest_edge"])
|
||||
max_height = max(height, max_height)
|
||||
max_width = max(width, max_width)
|
||||
return [self.num_frames, self.num_channels, max_height, max_width]
|
||||
return [
|
||||
self.num_frames,
|
||||
self.num_channels,
|
||||
self.max_image_size["longest_edge"],
|
||||
self.max_image_size["longest_edge"],
|
||||
]
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
|
||||
videos = prepare_video_inputs(
|
||||
@@ -116,3 +110,58 @@ class SmolVLMVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
|
||||
self.assertEqual(video_processor.size, {"height": 42, "width": 42})
|
||||
|
||||
# overwrite, SmolVLM requires to have metadata no matter how we sample
|
||||
def test_call_sample_frames(self):
|
||||
for video_processing_class in self.video_processor_list:
|
||||
video_processing = video_processing_class(**self.video_processor_dict)
|
||||
|
||||
prev_num_frames = self.video_processor_tester.num_frames
|
||||
self.video_processor_tester.num_frames = 8
|
||||
video_inputs = self.video_processor_tester.prepare_video_inputs(
|
||||
equal_resolution=False,
|
||||
return_tensors="torch",
|
||||
)
|
||||
|
||||
# Force set sampling to False. No sampling is expected even when `num_frames` exists
|
||||
video_processing.do_sample_frames = False
|
||||
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
|
||||
self.assertEqual(encoded_videos.shape[1], 8)
|
||||
self.assertEqual(encoded_videos_batched.shape[1], 8)
|
||||
|
||||
# Set sampling to True. Video frames should be sampled with `num_frames` in the output
|
||||
video_processing.do_sample_frames = True
|
||||
metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
|
||||
batched_metadata = metadata * len(video_inputs)
|
||||
|
||||
# Sample with `fps` requires metadata to infer number of frames from total duration
|
||||
with self.assertRaises(ValueError):
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=6, fps=3)[
|
||||
self.input_name
|
||||
]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=6, fps=3)[
|
||||
self.input_name
|
||||
]
|
||||
|
||||
encoded_videos = video_processing(
|
||||
video_inputs[0], return_tensors="pt", num_frames=6, fps=3, video_metadata=metadata
|
||||
)[self.input_name]
|
||||
encoded_videos_batched = video_processing(
|
||||
video_inputs, return_tensors="pt", num_frames=6, fps=3, video_metadata=batched_metadata
|
||||
)[self.input_name]
|
||||
self.assertEqual(encoded_videos.shape[1], 6)
|
||||
self.assertEqual(encoded_videos_batched.shape[1], 6)
|
||||
|
||||
# We should raise error when asked to sample more frames than there are in input video
|
||||
with self.assertRaises(ValueError):
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=10, num_frames=20)[
|
||||
self.input_name
|
||||
]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", fps=10, num_frames=20)[
|
||||
self.input_name
|
||||
]
|
||||
|
||||
# Assign back the actual num frames in tester
|
||||
self.video_processor_tester.num_frames = prev_num_frames
|
||||
|
||||
@@ -507,7 +507,7 @@ class ProcessorTesterMixin:
|
||||
if "video_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"video_processor attribute not present in {self.processor_class}")
|
||||
processor_components = self.prepare_components()
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length")
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
@@ -515,7 +515,7 @@ class ProcessorTesterMixin:
|
||||
input_str = self.prepare_text_inputs(modality="video")
|
||||
video_input = self.prepare_video_inputs()
|
||||
inputs = processor(text=input_str, videos=video_input, return_tensors="pt")
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 117)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 167)
|
||||
|
||||
def test_video_processor_defaults_preserved_by_video_kwargs(self):
|
||||
"""
|
||||
@@ -529,7 +529,7 @@ class ProcessorTesterMixin:
|
||||
processor_components["video_processor"] = self.get_component(
|
||||
"video_processor", do_rescale=True, rescale_factor=-1
|
||||
)
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length")
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
@@ -553,9 +553,9 @@ class ProcessorTesterMixin:
|
||||
input_str = self.prepare_text_inputs(modality="video")
|
||||
video_input = self.prepare_video_inputs()
|
||||
inputs = processor(
|
||||
text=input_str, videos=video_input, return_tensors="pt", max_length=112, padding="max_length"
|
||||
text=input_str, videos=video_input, return_tensors="pt", max_length=162, padding="max_length"
|
||||
)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 112)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 162)
|
||||
|
||||
def test_kwargs_overrides_default_video_processor_kwargs(self):
|
||||
if "video_processor" not in self.processor_class.attributes:
|
||||
@@ -564,7 +564,7 @@ class ProcessorTesterMixin:
|
||||
processor_components["video_processor"] = self.get_component(
|
||||
"video_processor", do_rescale=True, rescale_factor=1
|
||||
)
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length")
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
@@ -593,11 +593,11 @@ class ProcessorTesterMixin:
|
||||
do_rescale=True,
|
||||
rescale_factor=-1,
|
||||
padding="max_length",
|
||||
max_length=76,
|
||||
max_length=176,
|
||||
)
|
||||
|
||||
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 176)
|
||||
|
||||
def test_unstructured_kwargs_batched_video(self):
|
||||
if "video_processor" not in self.processor_class.attributes:
|
||||
@@ -616,13 +616,13 @@ class ProcessorTesterMixin:
|
||||
do_rescale=True,
|
||||
rescale_factor=-1,
|
||||
padding="longest",
|
||||
max_length=76,
|
||||
max_length=176,
|
||||
)
|
||||
|
||||
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
|
||||
self.assertTrue(
|
||||
len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1])
|
||||
and len(inputs[self.text_input_name][1]) < 76
|
||||
and len(inputs[self.text_input_name][1]) < 176
|
||||
)
|
||||
|
||||
def test_doubly_passed_kwargs_video(self):
|
||||
@@ -659,14 +659,14 @@ class ProcessorTesterMixin:
|
||||
all_kwargs = {
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
"videos_kwargs": {"do_rescale": True, "rescale_factor": -1},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 176},
|
||||
}
|
||||
|
||||
inputs = processor(text=input_str, videos=video_input, **all_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 176)
|
||||
|
||||
def test_structured_kwargs_nested_from_dict_video(self):
|
||||
if "video_processor" not in self.processor_class.attributes:
|
||||
@@ -682,12 +682,12 @@ class ProcessorTesterMixin:
|
||||
all_kwargs = {
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
"videos_kwargs": {"do_rescale": True, "rescale_factor": -1},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 176},
|
||||
}
|
||||
|
||||
inputs = processor(text=input_str, videos=video_input, **all_kwargs)
|
||||
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 176)
|
||||
|
||||
# TODO: the same test, but for audio + text processors that have strong overlap in kwargs
|
||||
# TODO (molbap) use the same structure of attribute kwargs for other tests to avoid duplication
|
||||
@@ -884,7 +884,7 @@ class ProcessorTesterMixin:
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors=return_tensors,
|
||||
num_frames=4, # by default no more than 4 frames, otherwise too slow
|
||||
num_frames=2, # by default no more than 2 frames, otherwise too slow
|
||||
)
|
||||
input_name = getattr(self, input_name)
|
||||
self.assertTrue(input_name in out_dict)
|
||||
@@ -983,6 +983,21 @@ class ProcessorTesterMixin:
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), video_fps * 10)
|
||||
|
||||
# When `do_sample_frames=False`, no sampling is done and the whole video is loaded, even if a sampling argument (here `video_fps`) is passed
|
||||
video_fps = 1
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
do_sample_frames=False,
|
||||
video_fps=video_fps,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 300)
|
||||
|
||||
# Load with `video_fps` and `num_frames` args, should raise an error
|
||||
with self.assertRaises(ValueError):
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
@@ -1024,75 +1039,6 @@ class ProcessorTesterMixin:
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 2)
|
||||
|
||||
@require_av
@require_torch
def test_apply_chat_template_video_special_processing(self):
    """
    Tests that models can use their own preprocessing to preprocess conversations.

    The processor's `_process_messages_for_chat_template` hook is replaced with a stub that ignores
    its inputs and always returns a fixed dummy conversation. The test then checks that the
    tokenized output contains the video inputs and that the decoded prompt contains the dummy text.
    """
    processor = self.get_processor()
    if processor.chat_template is None:
        self.skipTest("Processor has no chat template")

    # Skip processors whose `__call__` has no `videos` parameter or leaves it un-annotated
    videos_param = inspect.signature(processor.__call__).parameters.get("videos")
    if videos_param is None or videos_param.annotation is inspect.Parameter.empty:
        self.skipTest("Processor doesn't accept videos at input")

    video_file_path = hf_hub_download(
        repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
    )
    messages = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "video", "path": video_file_path},
                    {"type": "text", "text": "What is shown in this video?"},
                ],
            },
        ]
    ]

    def _process_messages_for_chat_template(
        conversation,
        batch_images,
        batch_videos,
        batch_video_metadata,
        **chat_template_kwargs,
    ):
        # Let us just always return a dummy prompt
        new_msg = [
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "video"},  # no need to use path, video is loaded already by this moment
                        {"type": "text", "text": "Dummy prompt for preprocess testing"},
                    ],
                },
            ]
        ]
        return new_msg

    # Override the hook on this instance only, so the dummy conversation above is what gets templated
    processor._process_messages_for_chat_template = _process_messages_for_chat_template
    out_dict_with_video = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    self.assertIn(self.videos_input_name, out_dict_with_video)

    # Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
    formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
    self.assertIn("Dummy prompt for preprocess testing", formatted_text)
    self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
    # NOTE(review): 243 is presumably the full frame count of the sample clip - confirm
    self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 243)
|
||||
|
||||
@require_librosa
|
||||
@require_av
|
||||
def test_chat_template_audio_from_video(self):
|
||||
|
||||
@@ -293,6 +293,59 @@ class VideoProcessingTestMixin:
|
||||
(self.video_processor_tester.batch_size, *expected_output_video_shape),
|
||||
)
|
||||
|
||||
def test_call_sample_frames(self):
    """
    Tests that frame sampling in the video processor call is controlled by `do_sample_frames`.

    Each scenario is checked for both a single video and the whole batch:
    - with `do_sample_frames=False`, `num_frames` is ignored and all 8 input frames are kept;
    - with `do_sample_frames=True`, `num_frames=3` yields 3 frames;
    - sampling by `fps` without `video_metadata` raises a `ValueError`;
    - sampling by `fps=3` with metadata describing a 2.0 second video yields 6 frames;
    - requesting more frames than the input video holds raises a `ValueError`.
    """
    for video_processing_class in self.video_processor_list:
        video_processing = video_processing_class(**self.video_processor_dict)

        prev_num_frames = self.video_processor_tester.num_frames
        self.video_processor_tester.num_frames = 8
        try:
            video_inputs = self.video_processor_tester.prepare_video_inputs(
                equal_resolution=False,
                return_tensors="torch",
            )
            # Every scenario below runs once on a single video and once on the full batch
            single_and_batched = (video_inputs[0], video_inputs)

            # Force set sampling to False. No sampling is expected even when `num_frames` exists
            video_processing.do_sample_frames = False
            for videos in single_and_batched:
                encoded_videos = video_processing(videos, return_tensors="pt", num_frames=3)[self.input_name]
                self.assertEqual(encoded_videos.shape[1], 8)

            # Set sampling to True. Video frames should be sampled with `num_frames` in the output
            video_processing.do_sample_frames = True
            for videos in single_and_batched:
                encoded_videos = video_processing(videos, return_tensors="pt", num_frames=3)[self.input_name]
                self.assertEqual(encoded_videos.shape[1], 3)

            # Sample with `fps` requires metadata to infer number of frames from total duration.
            # Each call gets its own `assertRaises` so the batched call is actually exercised
            # instead of being skipped once the single-video call has raised.
            for videos in single_and_batched:
                with self.assertRaises(ValueError):
                    video_processing(videos, return_tensors="pt", fps=3)

            metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
            batched_metadata = metadata * len(video_inputs)
            for videos, videos_metadata in zip(single_and_batched, (metadata, batched_metadata)):
                encoded_videos = video_processing(
                    videos, return_tensors="pt", fps=3, video_metadata=videos_metadata
                )[self.input_name]
                # 3 frames per second over a 2.0 second video
                self.assertEqual(encoded_videos.shape[1], 6)

            # We should raise error when asked to sample more frames than there are in input video
            for videos in single_and_batched:
                with self.assertRaises(ValueError):
                    video_processing(videos, return_tensors="pt", num_frames=10)
        finally:
            # Assign back the actual num frames in tester, even if an assertion above failed
            self.video_processor_tester.num_frames = prev_num_frames
|
||||
|
||||
def test_nested_input(self):
|
||||
"""Tests that the processor can work with nested list where each video is a list of arrays"""
|
||||
for video_processing_class in self.video_processor_list:
|
||||
|
||||
Reference in New Issue
Block a user