🔴 Video processors as a separate class (#35206)

* initial design

* update all video processors

* add tests

* need to add qwen2-vl (not tested yet)

* add qwen2-vl in auto map

* fix copies

* isort

* resolve confilicts kinda

* nit:

* qwen2-vl is happy now

* qwen2-5 happy

* other models are happy

* fix copies

* fix tests

* add docs

* CI green now?

* add more tests

* even more changes + tests

* doc builder fail

* nit

* Update src/transformers/models/auto/processing_auto.py

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* small update

* imports correctly

* dump, otherwise this is getting unmanagebale T-T

* dump

* update

* another update

* update

* tests

* move

* modular

* docs

* test

* another update

* init

* remove flakiness in tests

* fixup

* clean up and remove commented lines

* docs

* skip this one!

* last fix after rebasing

* run fixup

* delete slow files

* remove unnecessary tests + clean up a bit

* small fixes

* fix tests

* more updates

* docs

* fix tests

* update

* style

* fix qwen2-5-vl

* fixup

* fixup

* unflatten batch when preparing

* dump, come back soon

* add docs and fix some tests

* how to guard this with new dummies?

* chat templates in qwen

* address some comments

* remove `Fast` suffix

* fixup

* oops should be imported from transforms

* typo in requires dummies

* new model added with video support

* fixup once more

* last fixup I hope

* revert image processor name + comments

* oh, this is why fetch test is failing

* fix tests

* fix more tests

* fixup

* add new models: internvl, smolvlm

* update docs

* imprt once

* fix failing tests

* do we need to guard it here again, why?

* new model was added, update it

* remove testcase from tester

* fix tests

* make style

* not related CI fail, lets' just fix here

* mark flaky for now, filas 15 out of 100

* style

* maybe we can do this way?

* don't download images in setup class

---------

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
This commit is contained in:
Raushan Turganbay
2025-05-12 11:55:51 +02:00
committed by GitHub
parent 716819b830
commit a31fa218ad
83 changed files with 5418 additions and 2004 deletions

View File

@@ -18,12 +18,13 @@ import tempfile
import unittest
from huggingface_hub import hf_hub_download
from parameterized import parameterized
from transformers import AutoProcessor, AutoTokenizer, InternVLProcessor
from transformers.testing_utils import require_av, require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_processing_common import ProcessorTesterMixin
from ...test_processing_common import MODALITY_INPUT_DATA, ProcessorTesterMixin
if is_torch_available():
@@ -31,7 +32,7 @@ if is_torch_available():
if is_vision_available():
from transformers import GotOcr2ImageProcessor
from transformers import GotOcr2ImageProcessor, InternVLVideoProcessor
@require_vision
@@ -55,12 +56,22 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
image_std=[0.229, 0.224, 0.225],
do_convert_rgb=True,
)
video_processor = InternVLVideoProcessor(
do_resize=True,
size={"height": 20, "width": 20},
do_rescale=True,
rescale_factor=1 / 255,
do_normalize=True,
image_mean=[0.485, 0.456, 0.406],
image_std=[0.229, 0.224, 0.225],
do_convert_rgb=True,
)
tokenizer = AutoTokenizer.from_pretrained("OpenGVLab/InternVL3-1B-hf", padding_side="left")
processor_kwargs = cls.prepare_processor_dict()
processor = InternVLProcessor.from_pretrained(
"OpenGVLab/InternVL3-1B-hf",
processor = InternVLProcessor(
image_processor=image_processor,
tokenizer=tokenizer,
video_processor=video_processor,
**processor_kwargs,
)
processor.save_pretrained(cls.tmpdirname)
@@ -69,7 +80,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
@staticmethod
def prepare_processor_dict():
return {"image_seq_length": 10}
return {"image_seq_length": 2}
def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -77,6 +88,9 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def get_video_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
def get_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
@@ -168,6 +182,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# Override video chat_template tests as InternVLProcessor returns flattened video features
@require_av
@require_torch
def test_apply_chat_template_video_special_processing(self):
"""
Tests that models can use their own preprocessing to preprocess conversations.
@@ -225,7 +240,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="np",
return_tensors="pt",
num_frames=8,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
@@ -236,6 +251,8 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# Difference with common tests, InternVLProcessor returns flattened video features, and uses 8 frames by default
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)
@require_torch
@require_av
def test_apply_chat_template_video_frame_sampling(self):
processor = self.get_processor()
@@ -271,7 +288,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
tokenize=True,
return_dict=True,
num_frames=num_frames,
return_tensors="np",
return_tensors="pt",
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), num_frames)
@@ -284,6 +301,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 300)
@@ -302,6 +320,97 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2)
@require_av
@parameterized.expand([(1, "pt"), (2, "pt")])
def test_apply_chat_template_video(self, batch_size: int, return_tensors: str):
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
if "video_processor" not in self.processor_class.attributes:
self.skipTest(f"`video_processor` attribute not present in {self.processor_class}")
batch_messages = [
[
{
"role": "user",
"content": [{"type": "text", "text": "Describe this."}],
},
]
] * batch_size
# Test that jinja can be applied
formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
self.assertEqual(len(formatted_prompt), batch_size)
# Test that tokenizing with template and directly with `self.tokenizer` gives same output
formatted_prompt_tokenized = processor.apply_chat_template(
batch_messages, add_generation_prompt=True, tokenize=True, return_tensors="pt"
)
add_special_tokens = True
if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
add_special_tokens = False
tok_output = processor.tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=add_special_tokens)
expected_output = tok_output.input_ids
self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())
# Test that kwargs passed to processor's `__call__` are actually used
tokenized_prompt_100 = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
padding="max_length",
truncation=True,
return_tensors="pt",
max_length=100,
)
self.assertEqual(len(tokenized_prompt_100[0]), 100)
# Test that `return_dict=True` returns text related inputs in the dict
out_dict_text = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)
self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"]))
self.assertEqual(len(out_dict_text["input_ids"]), batch_size)
self.assertEqual(len(out_dict_text["attention_mask"]), batch_size)
# Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
for idx, url in enumerate(MODALITY_INPUT_DATA["videos"][:batch_size]):
batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": "video", "url": url}]
out_dict = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
num_frames=4, # by default no more than 4 frames, otherwise too slow
)
self.assertTrue(self.videos_input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
video_len = 4 if batch_size == 1 else 3 # InternVL patches out and removes frames after processing
self.assertEqual(len(out_dict[self.videos_input_name]), video_len)
for k in out_dict:
self.assertIsInstance(out_dict[k], torch.Tensor)
# Test continue from final message
assistant_message = {
"role": "assistant",
"content": [{"type": "text", "text": "It is the sound of"}],
}
for batch_idx in range(batch_size):
batch_messages[batch_idx] = batch_messages[batch_idx] + [assistant_message]
continue_prompt = processor.apply_chat_template(batch_messages, continue_final_message=True, tokenize=False)
for prompt in continue_prompt:
self.assertTrue(prompt.endswith("It is the sound of")) # no `eos` token at the end

View File

@@ -0,0 +1,107 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
if is_torch_available():
pass
if is_vision_available():
if is_torchvision_available():
from transformers import InternVLVideoProcessor
class InternVLVideoProcessingTester:
def __init__(
self,
parent,
batch_size=5,
num_frames=8,
num_channels=3,
min_resolution=30,
max_resolution=80,
do_resize=True,
size=None,
do_normalize=True,
image_mean=OPENAI_CLIP_MEAN,
image_std=OPENAI_CLIP_STD,
do_convert_rgb=True,
):
size = size if size is not None else {"height": 384, "width": 384}
self.parent = parent
self.batch_size = batch_size
self.num_frames = num_frames
self.num_channels = num_channels
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
def prepare_video_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
}
def expected_output_video_shape(self, videos):
return [self.num_frames, self.num_channels, self.size["height"], self.size["width"]]
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
videos = prepare_video_inputs(
batch_size=self.batch_size,
num_frames=self.num_frames,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
return_tensors=return_tensors,
)
return videos
@require_torch
@require_vision
class InternVLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
fast_video_processing_class = InternVLVideoProcessor if is_torchvision_available() else None
def setUp(self):
super().setUp()
self.video_processor_tester = InternVLVideoProcessingTester(self)
@property
def video_processor_dict(self):
return self.video_processor_tester.prepare_video_processor_dict()
def test_video_processor_from_dict_with_kwargs(self):
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
self.assertEqual(video_processor.size, {"height": 384, "width": 384})
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
self.assertEqual(video_processor.size, {"height": 42, "width": 42})