🔴 Video processors as a separate class (#35206)
* initial design * update all video processors * add tests * need to add qwen2-vl (not tested yet) * add qwen2-vl in auto map * fix copies * isort * resolve confilicts kinda * nit: * qwen2-vl is happy now * qwen2-5 happy * other models are happy * fix copies * fix tests * add docs * CI green now? * add more tests * even more changes + tests * doc builder fail * nit * Update src/transformers/models/auto/processing_auto.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * small update * imports correctly * dump, otherwise this is getting unmanagebale T-T * dump * update * another update * update * tests * move * modular * docs * test * another update * init * remove flakiness in tests * fixup * clean up and remove commented lines * docs * skip this one! * last fix after rebasing * run fixup * delete slow files * remove unnecessary tests + clean up a bit * small fixes * fix tests * more updates * docs * fix tests * update * style * fix qwen2-5-vl * fixup * fixup * unflatten batch when preparing * dump, come back soon * add docs and fix some tests * how to guard this with new dummies? * chat templates in qwen * address some comments * remove `Fast` suffix * fixup * oops should be imported from transforms * typo in requires dummies * new model added with video support * fixup once more * last fixup I hope * revert image processor name + comments * oh, this is why fetch test is failing * fix tests * fix more tests * fixup * add new models: internvl, smolvlm * update docs * imprt once * fix failing tests * do we need to guard it here again, why? * new model was added, update it * remove testcase from tester * fix tests * make style * not related CI fail, lets' just fix here * mark flaky for now, filas 15 out of 100 * style * maybe we can do this way? * don't download images in setup class --------- Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
This commit is contained in:
committed by
GitHub
parent
716819b830
commit
a31fa218ad
@@ -18,12 +18,13 @@ import tempfile
|
||||
import unittest
|
||||
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import AutoProcessor, AutoTokenizer, InternVLProcessor
|
||||
from transformers.testing_utils import require_av, require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
from ...test_processing_common import MODALITY_INPUT_DATA, ProcessorTesterMixin
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
@@ -31,7 +32,7 @@ if is_torch_available():
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import GotOcr2ImageProcessor
|
||||
from transformers import GotOcr2ImageProcessor, InternVLVideoProcessor
|
||||
|
||||
|
||||
@require_vision
|
||||
@@ -55,12 +56,22 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
image_std=[0.229, 0.224, 0.225],
|
||||
do_convert_rgb=True,
|
||||
)
|
||||
video_processor = InternVLVideoProcessor(
|
||||
do_resize=True,
|
||||
size={"height": 20, "width": 20},
|
||||
do_rescale=True,
|
||||
rescale_factor=1 / 255,
|
||||
do_normalize=True,
|
||||
image_mean=[0.485, 0.456, 0.406],
|
||||
image_std=[0.229, 0.224, 0.225],
|
||||
do_convert_rgb=True,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained("OpenGVLab/InternVL3-1B-hf", padding_side="left")
|
||||
processor_kwargs = cls.prepare_processor_dict()
|
||||
processor = InternVLProcessor.from_pretrained(
|
||||
"OpenGVLab/InternVL3-1B-hf",
|
||||
processor = InternVLProcessor(
|
||||
image_processor=image_processor,
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
**processor_kwargs,
|
||||
)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
@@ -69,7 +80,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {"image_seq_length": 10}
|
||||
return {"image_seq_length": 2}
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
@@ -77,6 +88,9 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_image_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
||||
|
||||
def get_video_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
|
||||
|
||||
def get_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
@@ -168,6 +182,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
# Override video chat_template tests as InternVLProcessor returns flattened video features
|
||||
@require_av
|
||||
@require_torch
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
"""
|
||||
Tests that models can use their own preprocessing to preprocess conversations.
|
||||
@@ -225,7 +240,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="np",
|
||||
return_tensors="pt",
|
||||
num_frames=8,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
@@ -236,6 +251,8 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
# Difference with common tests, InternVLProcessor returns flattened video features, and uses 8 frames by default
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)
|
||||
|
||||
@require_torch
|
||||
@require_av
|
||||
def test_apply_chat_template_video_frame_sampling(self):
|
||||
processor = self.get_processor()
|
||||
|
||||
@@ -271,7 +288,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
num_frames=num_frames,
|
||||
return_tensors="np",
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), num_frames)
|
||||
@@ -284,6 +301,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 300)
|
||||
@@ -302,6 +320,97 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2)
|
||||
|
||||
@require_av
|
||||
@parameterized.expand([(1, "pt"), (2, "pt")])
|
||||
def test_apply_chat_template_video(self, batch_size: int, return_tensors: str):
|
||||
processor = self.get_processor()
|
||||
if processor.chat_template is None:
|
||||
self.skipTest("Processor has no chat template")
|
||||
|
||||
if "video_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"`video_processor` attribute not present in {self.processor_class}")
|
||||
|
||||
batch_messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{"type": "text", "text": "Describe this."}],
|
||||
},
|
||||
]
|
||||
] * batch_size
|
||||
|
||||
# Test that jinja can be applied
|
||||
formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
|
||||
self.assertEqual(len(formatted_prompt), batch_size)
|
||||
|
||||
# Test that tokenizing with template and directly with `self.tokenizer` gives same output
|
||||
formatted_prompt_tokenized = processor.apply_chat_template(
|
||||
batch_messages, add_generation_prompt=True, tokenize=True, return_tensors="pt"
|
||||
)
|
||||
add_special_tokens = True
|
||||
if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
|
||||
add_special_tokens = False
|
||||
tok_output = processor.tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=add_special_tokens)
|
||||
expected_output = tok_output.input_ids
|
||||
self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())
|
||||
|
||||
# Test that kwargs passed to processor's `__call__` are actually used
|
||||
tokenized_prompt_100 = processor.apply_chat_template(
|
||||
batch_messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
max_length=100,
|
||||
)
|
||||
self.assertEqual(len(tokenized_prompt_100[0]), 100)
|
||||
|
||||
# Test that `return_dict=True` returns text related inputs in the dict
|
||||
out_dict_text = processor.apply_chat_template(
|
||||
batch_messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"]))
|
||||
self.assertEqual(len(out_dict_text["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict_text["attention_mask"]), batch_size)
|
||||
|
||||
# Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
|
||||
for idx, url in enumerate(MODALITY_INPUT_DATA["videos"][:batch_size]):
|
||||
batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": "video", "url": url}]
|
||||
|
||||
out_dict = processor.apply_chat_template(
|
||||
batch_messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
num_frames=4, # by default no more than 4 frames, otherwise too slow
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
|
||||
video_len = 4 if batch_size == 1 else 3 # InternVL patches out and removes frames after processing
|
||||
self.assertEqual(len(out_dict[self.videos_input_name]), video_len)
|
||||
for k in out_dict:
|
||||
self.assertIsInstance(out_dict[k], torch.Tensor)
|
||||
|
||||
# Test continue from final message
|
||||
assistant_message = {
|
||||
"role": "assistant",
|
||||
"content": [{"type": "text", "text": "It is the sound of"}],
|
||||
}
|
||||
for batch_idx in range(batch_size):
|
||||
batch_messages[batch_idx] = batch_messages[batch_idx] + [assistant_message]
|
||||
continue_prompt = processor.apply_chat_template(batch_messages, continue_final_message=True, tokenize=False)
|
||||
for prompt in continue_prompt:
|
||||
self.assertTrue(prompt.endswith("It is the sound of")) # no `eos` token at the end
|
||||
|
||||
107
tests/models/internvl/test_video_processor_internvl.py
Normal file
107
tests/models/internvl/test_video_processor_internvl.py
Normal file
@@ -0,0 +1,107 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
pass
|
||||
|
||||
if is_vision_available():
|
||||
if is_torchvision_available():
|
||||
from transformers import InternVLVideoProcessor
|
||||
|
||||
|
||||
class InternVLVideoProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=5,
|
||||
num_frames=8,
|
||||
num_channels=3,
|
||||
min_resolution=30,
|
||||
max_resolution=80,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_normalize=True,
|
||||
image_mean=OPENAI_CLIP_MEAN,
|
||||
image_std=OPENAI_CLIP_STD,
|
||||
do_convert_rgb=True,
|
||||
):
|
||||
size = size if size is not None else {"height": 384, "width": 384}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_frames = num_frames
|
||||
self.num_channels = num_channels
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
def prepare_video_processor_dict(self):
|
||||
return {
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
}
|
||||
|
||||
def expected_output_video_shape(self, videos):
|
||||
return [self.num_frames, self.num_channels, self.size["height"], self.size["width"]]
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
|
||||
videos = prepare_video_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_frames=self.num_frames,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
return videos
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class InternVLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
fast_video_processing_class = InternVLVideoProcessor if is_torchvision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.video_processor_tester = InternVLVideoProcessingTester(self)
|
||||
|
||||
@property
|
||||
def video_processor_dict(self):
|
||||
return self.video_processor_tester.prepare_video_processor_dict()
|
||||
|
||||
def test_video_processor_from_dict_with_kwargs(self):
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
|
||||
self.assertEqual(video_processor.size, {"height": 384, "width": 384})
|
||||
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
|
||||
self.assertEqual(video_processor.size, {"height": 42, "width": 42})
|
||||
Reference in New Issue
Block a user