🔴 Video processors as a separate class (#35206)
* initial design * update all video processors * add tests * need to add qwen2-vl (not tested yet) * add qwen2-vl in auto map * fix copies * isort * resolve confilicts kinda * nit: * qwen2-vl is happy now * qwen2-5 happy * other models are happy * fix copies * fix tests * add docs * CI green now? * add more tests * even more changes + tests * doc builder fail * nit * Update src/transformers/models/auto/processing_auto.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * small update * imports correctly * dump, otherwise this is getting unmanagebale T-T * dump * update * another update * update * tests * move * modular * docs * test * another update * init * remove flakiness in tests * fixup * clean up and remove commented lines * docs * skip this one! * last fix after rebasing * run fixup * delete slow files * remove unnecessary tests + clean up a bit * small fixes * fix tests * more updates * docs * fix tests * update * style * fix qwen2-5-vl * fixup * fixup * unflatten batch when preparing * dump, come back soon * add docs and fix some tests * how to guard this with new dummies? * chat templates in qwen * address some comments * remove `Fast` suffix * fixup * oops should be imported from transforms * typo in requires dummies * new model added with video support * fixup once more * last fixup I hope * revert image processor name + comments * oh, this is why fetch test is failing * fix tests * fix more tests * fixup * add new models: internvl, smolvlm * update docs * imprt once * fix failing tests * do we need to guard it here again, why? * new model was added, update it * remove testcase from tester * fix tests * make style * not related CI fail, lets' just fix here * mark flaky for now, filas 15 out of 100 * style * maybe we can do this way? * don't download images in setup class --------- Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
This commit is contained in:
committed by
GitHub
parent
716819b830
commit
a31fa218ad
@@ -73,6 +73,19 @@ class AutoImageProcessorTest(unittest.TestCase):
|
||||
config = AutoImageProcessor.from_pretrained(tmpdirname)
|
||||
self.assertIsInstance(config, CLIPImageProcessor)
|
||||
|
||||
def test_image_processor_from_new_filename(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
|
||||
config_tmpfile = Path(tmpdirname) / "config.json"
|
||||
json.dump(
|
||||
{"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
|
||||
open(processor_tmpfile, "w"),
|
||||
)
|
||||
json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
|
||||
|
||||
config = AutoImageProcessor.from_pretrained(tmpdirname)
|
||||
self.assertIsInstance(config, CLIPImageProcessor)
|
||||
|
||||
def test_image_processor_from_local_directory_from_config(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model_config = CLIPConfig()
|
||||
|
||||
@@ -40,7 +40,11 @@ from transformers import (
|
||||
)
|
||||
from transformers.testing_utils import TOKEN, TemporaryHubRepo, get_tests_dir, is_staging_test
|
||||
from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
|
||||
from transformers.utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, is_tokenizers_available
|
||||
from transformers.utils import (
|
||||
FEATURE_EXTRACTOR_NAME,
|
||||
PROCESSOR_NAME,
|
||||
is_tokenizers_available,
|
||||
)
|
||||
|
||||
|
||||
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
|
||||
@@ -395,6 +399,13 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
||||
processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-convnext")
|
||||
self.assertEqual(processor.__class__.__name__, "ConvNextImageProcessor")
|
||||
|
||||
def test_auto_processor_save_load(self):
|
||||
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
processor.save_pretrained(tmp_dir)
|
||||
second_processor = AutoProcessor.from_pretrained(tmp_dir)
|
||||
self.assertEqual(second_processor.__class__.__name__, processor.__class__.__name__)
|
||||
|
||||
|
||||
@is_staging_test
|
||||
class ProcessorPushToHubTester(unittest.TestCase):
|
||||
|
||||
252
tests/models/auto/test_video_processing_auto.py
Normal file
252
tests/models/auto/test_video_processing_auto.py
Normal file
@@ -0,0 +1,252 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
VIDEO_PROCESSOR_MAPPING,
|
||||
AutoConfig,
|
||||
AutoVideoProcessor,
|
||||
LlavaOnevisionConfig,
|
||||
LlavaOnevisionVideoProcessor,
|
||||
)
|
||||
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_torch
|
||||
|
||||
|
||||
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
|
||||
|
||||
from test_module.custom_configuration import CustomConfig # noqa E402
|
||||
from test_module.custom_video_processing import CustomVideoProcessor # noqa E402
|
||||
|
||||
|
||||
@require_torch
|
||||
class AutoVideoProcessorTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
|
||||
|
||||
def test_video_processor_from_model_shortcut(self):
|
||||
config = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
|
||||
self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
|
||||
|
||||
def test_video_processor_from_local_directory_from_key(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
|
||||
config_tmpfile = Path(tmpdirname) / "config.json"
|
||||
json.dump(
|
||||
{
|
||||
"video_processor_type": "LlavaOnevisionVideoProcessor",
|
||||
"processor_class": "LlavaOnevisionProcessor",
|
||||
},
|
||||
open(processor_tmpfile, "w"),
|
||||
)
|
||||
json.dump({"model_type": "llava_onevision"}, open(config_tmpfile, "w"))
|
||||
|
||||
config = AutoVideoProcessor.from_pretrained(tmpdirname)
|
||||
self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
|
||||
|
||||
def test_video_processor_from_local_directory_from_preprocessor_key(self):
|
||||
# Ensure we can load the image processor from the feature extractor config
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
|
||||
config_tmpfile = Path(tmpdirname) / "config.json"
|
||||
json.dump(
|
||||
{
|
||||
"video_processor_type": "LlavaOnevisionVideoProcessor",
|
||||
"processor_class": "LlavaOnevisionProcessor",
|
||||
},
|
||||
open(processor_tmpfile, "w"),
|
||||
)
|
||||
json.dump({"model_type": "llava_onevision"}, open(config_tmpfile, "w"))
|
||||
|
||||
config = AutoVideoProcessor.from_pretrained(tmpdirname)
|
||||
self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
|
||||
|
||||
def test_video_processor_from_local_directory_from_config(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model_config = LlavaOnevisionConfig()
|
||||
|
||||
# Create a dummy config file with image_proceesor_type
|
||||
processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
|
||||
config_tmpfile = Path(tmpdirname) / "config.json"
|
||||
json.dump(
|
||||
{
|
||||
"video_processor_type": "LlavaOnevisionVideoProcessor",
|
||||
"processor_class": "LlavaOnevisionProcessor",
|
||||
},
|
||||
open(processor_tmpfile, "w"),
|
||||
)
|
||||
json.dump({"model_type": "llava_onevision"}, open(config_tmpfile, "w"))
|
||||
|
||||
# remove video_processor_type to make sure config.json alone is enough to load image processor locally
|
||||
config_dict = AutoVideoProcessor.from_pretrained(tmpdirname).to_dict()
|
||||
|
||||
config_dict.pop("video_processor_type")
|
||||
config = LlavaOnevisionVideoProcessor(**config_dict)
|
||||
|
||||
# save in new folder
|
||||
model_config.save_pretrained(tmpdirname)
|
||||
config.save_pretrained(tmpdirname)
|
||||
|
||||
config = AutoVideoProcessor.from_pretrained(tmpdirname)
|
||||
|
||||
# make sure private variable is not incorrectly saved
|
||||
dict_as_saved = json.loads(config.to_json_string())
|
||||
self.assertTrue("_processor_class" not in dict_as_saved)
|
||||
|
||||
self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
|
||||
|
||||
def test_video_processor_from_local_file(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
|
||||
json.dump(
|
||||
{
|
||||
"video_processor_type": "LlavaOnevisionVideoProcessor",
|
||||
"processor_class": "LlavaOnevisionProcessor",
|
||||
},
|
||||
open(processor_tmpfile, "w"),
|
||||
)
|
||||
|
||||
config = AutoVideoProcessor.from_pretrained(processor_tmpfile)
|
||||
self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
|
||||
|
||||
def test_repo_not_found(self):
|
||||
with self.assertRaisesRegex(
|
||||
EnvironmentError,
|
||||
"llava-hf/llava-doesnt-exist is not a local folder and is not a valid model identifier",
|
||||
):
|
||||
_ = AutoVideoProcessor.from_pretrained("llava-hf/llava-doesnt-exist")
|
||||
|
||||
def test_revision_not_found(self):
|
||||
with self.assertRaisesRegex(
|
||||
EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
|
||||
):
|
||||
_ = AutoVideoProcessor.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
|
||||
|
||||
def test_video_processor_not_found(self):
|
||||
with self.assertRaisesRegex(
|
||||
EnvironmentError,
|
||||
"hf-internal-testing/config-no-model does not appear to have a file named preprocessor_config.json.",
|
||||
):
|
||||
_ = AutoVideoProcessor.from_pretrained("hf-internal-testing/config-no-model")
|
||||
|
||||
def test_from_pretrained_dynamic_video_processor(self):
|
||||
# If remote code is not set, we will time out when asking whether to load the model.
|
||||
with self.assertRaises(ValueError):
|
||||
video_processor = AutoVideoProcessor.from_pretrained("hf-internal-testing/test_dynamic_video_processor")
|
||||
# If remote code is disabled, we can't load this config.
|
||||
with self.assertRaises(ValueError):
|
||||
video_processor = AutoVideoProcessor.from_pretrained(
|
||||
"hf-internal-testing/test_dynamic_video_processor", trust_remote_code=False
|
||||
)
|
||||
|
||||
video_processor = AutoVideoProcessor.from_pretrained(
|
||||
"hf-internal-testing/test_dynamic_video_processor", trust_remote_code=True
|
||||
)
|
||||
self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
|
||||
|
||||
# Test the dynamic module is loaded only once.
|
||||
reloaded_video_processor = AutoVideoProcessor.from_pretrained(
|
||||
"hf-internal-testing/test_dynamic_video_processor", trust_remote_code=True
|
||||
)
|
||||
self.assertIs(video_processor.__class__, reloaded_video_processor.__class__)
|
||||
|
||||
# Test image processor can be reloaded.
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
video_processor.save_pretrained(tmp_dir)
|
||||
reloaded_video_processor = AutoVideoProcessor.from_pretrained(tmp_dir, trust_remote_code=True)
|
||||
self.assertEqual(reloaded_video_processor.__class__.__name__, "NewVideoProcessor")
|
||||
|
||||
# The image processor file is cached in the snapshot directory. So the module file is not changed after dumping
|
||||
# to a temp dir. Because the revision of the module file is not changed.
|
||||
# Test the dynamic module is loaded only once if the module file is not changed.
|
||||
self.assertIs(video_processor.__class__, reloaded_video_processor.__class__)
|
||||
|
||||
# Test the dynamic module is reloaded if we force it.
|
||||
reloaded_video_processor = AutoVideoProcessor.from_pretrained(
|
||||
"hf-internal-testing/test_dynamic_video_processor", trust_remote_code=True, force_download=True
|
||||
)
|
||||
self.assertIsNot(video_processor.__class__, reloaded_video_processor.__class__)
|
||||
|
||||
def test_new_video_processor_registration(self):
|
||||
try:
|
||||
AutoConfig.register("custom", CustomConfig)
|
||||
AutoVideoProcessor.register(CustomConfig, CustomVideoProcessor)
|
||||
# Trying to register something existing in the Transformers library will raise an error
|
||||
with self.assertRaises(ValueError):
|
||||
AutoVideoProcessor.register(LlavaOnevisionConfig, LlavaOnevisionVideoProcessor)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
|
||||
config_tmpfile = Path(tmpdirname) / "config.json"
|
||||
json.dump(
|
||||
{
|
||||
"video_processor_type": "LlavaOnevisionVideoProcessor",
|
||||
"processor_class": "LlavaOnevisionProcessor",
|
||||
},
|
||||
open(processor_tmpfile, "w"),
|
||||
)
|
||||
json.dump({"model_type": "llava_onevision"}, open(config_tmpfile, "w"))
|
||||
|
||||
video_processor = CustomVideoProcessor.from_pretrained(tmpdirname)
|
||||
|
||||
# Now that the config is registered, it can be used as any other config with the auto-API
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
video_processor.save_pretrained(tmp_dir)
|
||||
new_video_processor = AutoVideoProcessor.from_pretrained(tmp_dir)
|
||||
self.assertIsInstance(new_video_processor, CustomVideoProcessor)
|
||||
|
||||
finally:
|
||||
if "custom" in CONFIG_MAPPING._extra_content:
|
||||
del CONFIG_MAPPING._extra_content["custom"]
|
||||
if CustomConfig in VIDEO_PROCESSOR_MAPPING._extra_content:
|
||||
del VIDEO_PROCESSOR_MAPPING._extra_content[CustomConfig]
|
||||
|
||||
def test_from_pretrained_dynamic_video_processor_conflict(self):
|
||||
class NewVideoProcessor(LlavaOnevisionVideoProcessor):
|
||||
is_local = True
|
||||
|
||||
try:
|
||||
AutoConfig.register("custom", CustomConfig)
|
||||
AutoVideoProcessor.register(CustomConfig, NewVideoProcessor)
|
||||
# If remote code is not set, the default is to use local
|
||||
video_processor = AutoVideoProcessor.from_pretrained("hf-internal-testing/test_dynamic_video_processor")
|
||||
self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
|
||||
self.assertTrue(video_processor.is_local)
|
||||
|
||||
# If remote code is disabled, we load the local one.
|
||||
video_processor = AutoVideoProcessor.from_pretrained(
|
||||
"hf-internal-testing/test_dynamic_video_processor", trust_remote_code=False
|
||||
)
|
||||
self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
|
||||
self.assertTrue(video_processor.is_local)
|
||||
|
||||
# If remote is enabled, we load from the Hub
|
||||
video_processor = AutoVideoProcessor.from_pretrained(
|
||||
"hf-internal-testing/test_dynamic_video_processor", trust_remote_code=True
|
||||
)
|
||||
self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
|
||||
self.assertTrue(not hasattr(video_processor, "is_local"))
|
||||
|
||||
finally:
|
||||
if "custom" in CONFIG_MAPPING._extra_content:
|
||||
del CONFIG_MAPPING._extra_content["custom"]
|
||||
if CustomConfig in VIDEO_PROCESSOR_MAPPING._extra_content:
|
||||
del VIDEO_PROCESSOR_MAPPING._extra_content[CustomConfig]
|
||||
@@ -1,190 +0,0 @@
|
||||
# Copyright 2024 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import InstructBlipVideoImageProcessor
|
||||
|
||||
|
||||
class InstructBlipVideoProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=5,
|
||||
num_channels=3,
|
||||
image_size=24,
|
||||
min_resolution=30,
|
||||
max_resolution=80,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_normalize=True,
|
||||
image_mean=OPENAI_CLIP_MEAN,
|
||||
image_std=OPENAI_CLIP_STD,
|
||||
do_convert_rgb=True,
|
||||
frames=4,
|
||||
):
|
||||
size = size if size is not None else {"height": 18, "width": 18}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_channels = num_channels
|
||||
self.image_size = image_size
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
self.frames = frames
|
||||
|
||||
def prepare_image_processor_dict(self):
|
||||
return {
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
}
|
||||
|
||||
def expected_output_image_shape(self, images):
|
||||
return self.frames, self.num_channels, self.size["height"], self.size["width"]
|
||||
|
||||
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
|
||||
images = prepare_image_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
numpify=numpify,
|
||||
torchify=torchify,
|
||||
)
|
||||
|
||||
# let's simply copy the frames to fake a long video-clip
|
||||
if numpify or torchify:
|
||||
videos = []
|
||||
for image in images:
|
||||
if numpify:
|
||||
video = image[None, ...].repeat(self.frames, 0)
|
||||
else:
|
||||
video = image[None, ...].repeat(self.frames, 1, 1, 1)
|
||||
videos.append(video)
|
||||
else:
|
||||
videos = []
|
||||
for pil_image in images:
|
||||
videos.append([pil_image] * self.frames)
|
||||
|
||||
return videos
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class InstructBlipVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = InstructBlipVideoImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = InstructBlipVideoProcessingTester(self)
|
||||
|
||||
@property
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
|
||||
def image_processor_dict(self):
|
||||
return self.image_processor_tester.prepare_image_processor_dict()
|
||||
|
||||
def test_image_processor_properties(self):
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
self.assertTrue(hasattr(image_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(image_processing, "size"))
|
||||
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(image_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(image_processing, "image_std"))
|
||||
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
|
||||
|
||||
def test_image_processor_from_dict_with_kwargs(self):
|
||||
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
|
||||
self.assertEqual(image_processor.size, {"height": 18, "width": 18})
|
||||
|
||||
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
|
||||
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
|
||||
|
||||
def test_call_pil(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random numpy tensors
|
||||
video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video[0], Image.Image)
|
||||
|
||||
# Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!)
|
||||
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values
|
||||
expected_output_video_shape = (1, 4, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values
|
||||
expected_output_video_shape = (5, 4, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
def test_call_numpy(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random numpy tensors
|
||||
video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video, np.ndarray)
|
||||
|
||||
# Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!)
|
||||
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values
|
||||
expected_output_video_shape = (1, 4, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values
|
||||
expected_output_video_shape = (5, 4, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
def test_call_pytorch(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random PyTorch tensors
|
||||
video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video, torch.Tensor)
|
||||
|
||||
# Test not batched input
|
||||
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values
|
||||
expected_output_video_shape = (1, 4, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values
|
||||
expected_output_video_shape = (5, 4, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
@@ -17,8 +17,8 @@ import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from transformers.testing_utils import require_vision
|
||||
from transformers.utils import is_vision_available
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
@@ -28,14 +28,16 @@ if is_vision_available():
|
||||
AutoProcessor,
|
||||
BertTokenizerFast,
|
||||
GPT2Tokenizer,
|
||||
InstructBlipVideoImageProcessor,
|
||||
InstructBlipVideoProcessor,
|
||||
PreTrainedTokenizerFast,
|
||||
)
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import InstructBlipVideoVideoProcessor
|
||||
|
||||
|
||||
@require_vision
|
||||
# Copied from tests.models.instructblip.test_processor_instructblip.InstructBlipProcessorTest with InstructBlip->InstructBlipVideo, BlipImageProcessor->InstructBlipVideoImageProcessor
|
||||
@require_torch
|
||||
class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_class = InstructBlipVideoProcessor
|
||||
|
||||
@@ -43,23 +45,23 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def setUpClass(cls):
|
||||
cls.tmpdirname = tempfile.mkdtemp()
|
||||
|
||||
image_processor = InstructBlipVideoImageProcessor()
|
||||
video_processor = InstructBlipVideoVideoProcessor()
|
||||
tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model")
|
||||
qformer_tokenizer = BertTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-bert")
|
||||
|
||||
processor = InstructBlipVideoProcessor(image_processor, tokenizer, qformer_tokenizer)
|
||||
processor = InstructBlipVideoProcessor(video_processor, tokenizer, qformer_tokenizer)
|
||||
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
|
||||
def get_image_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
||||
|
||||
def get_qformer_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).qformer_tokenizer
|
||||
|
||||
def get_video_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||
@@ -67,14 +69,14 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def test_save_load_pretrained_additional_features(self):
|
||||
processor = InstructBlipVideoProcessor(
|
||||
tokenizer=self.get_tokenizer(),
|
||||
image_processor=self.get_image_processor(),
|
||||
video_processor=self.get_video_processor(),
|
||||
qformer_tokenizer=self.get_qformer_tokenizer(),
|
||||
)
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
processor.save_pretrained(tmpdir)
|
||||
|
||||
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
||||
image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
|
||||
video_processor_add_kwargs = self.get_video_processor(do_normalize=False, padding_value=1.0)
|
||||
|
||||
processor = InstructBlipVideoProcessor.from_pretrained(
|
||||
tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
|
||||
@@ -83,34 +85,34 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||
self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast)
|
||||
|
||||
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
|
||||
self.assertIsInstance(processor.image_processor, InstructBlipVideoImageProcessor)
|
||||
self.assertEqual(processor.video_processor.to_json_string(), video_processor_add_kwargs.to_json_string())
|
||||
self.assertIsInstance(processor.video_processor, InstructBlipVideoVideoProcessor)
|
||||
self.assertIsInstance(processor.qformer_tokenizer, BertTokenizerFast)
|
||||
|
||||
def test_image_processor(self):
|
||||
image_processor = self.get_image_processor()
|
||||
def test_video_processor(self):
|
||||
video_processor = self.get_video_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
|
||||
processor = InstructBlipVideoProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
|
||||
)
|
||||
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
input_feat_extract = image_processor(image_input, return_tensors="np")
|
||||
input_processor = processor(images=image_input, return_tensors="np")
|
||||
input_feat_extract = video_processor(image_input, return_tensors="pt")
|
||||
input_processor = processor(images=image_input, return_tensors="pt")
|
||||
|
||||
for key in input_feat_extract.keys():
|
||||
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||
|
||||
def test_tokenizer(self):
|
||||
image_processor = self.get_image_processor()
|
||||
video_processor = self.get_video_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
|
||||
processor = InstructBlipVideoProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
|
||||
)
|
||||
|
||||
input_str = ["lower newer"]
|
||||
@@ -127,12 +129,12 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertListEqual(encoded_tokens_qformer[key], encoded_processor["qformer_" + key])
|
||||
|
||||
def test_processor(self):
|
||||
image_processor = self.get_image_processor()
|
||||
video_processor = self.get_video_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
|
||||
processor = InstructBlipVideoProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
|
||||
)
|
||||
|
||||
input_str = "lower newer"
|
||||
@@ -150,12 +152,12 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor()
|
||||
|
||||
def test_tokenizer_decode(self):
|
||||
image_processor = self.get_image_processor()
|
||||
video_processor = self.get_video_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
|
||||
processor = InstructBlipVideoProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
|
||||
)
|
||||
|
||||
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
||||
@@ -166,12 +168,12 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertListEqual(decoded_tok, decoded_processor)
|
||||
|
||||
def test_model_input_names(self):
|
||||
image_processor = self.get_image_processor()
|
||||
video_processor = self.get_video_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
qformer_tokenizer = self.get_qformer_tokenizer()
|
||||
|
||||
processor = InstructBlipVideoProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
|
||||
tokenizer=tokenizer, video_processor=video_processor, qformer_tokenizer=qformer_tokenizer
|
||||
)
|
||||
|
||||
input_str = "lower newer"
|
||||
|
||||
@@ -0,0 +1,116 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
if is_torchvision_available():
|
||||
from transformers import InstructBlipVideoVideoProcessor
|
||||
|
||||
|
||||
class InstructBlipVideoVideoProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=5,
|
||||
num_channels=3,
|
||||
num_frames=4,
|
||||
min_resolution=30,
|
||||
max_resolution=80,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_normalize=True,
|
||||
image_mean=OPENAI_CLIP_MEAN,
|
||||
image_std=OPENAI_CLIP_STD,
|
||||
do_convert_rgb=True,
|
||||
):
|
||||
super().__init__()
|
||||
size = size if size is not None else {"height": 18, "width": 18}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_frames = num_frames
|
||||
self.num_channels = num_channels
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
def prepare_video_processor_dict(self):
|
||||
return {
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
}
|
||||
|
||||
def expected_output_video_shape(self, images):
|
||||
return self.num_frames, self.num_channels, self.size["height"], self.size["width"]
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
|
||||
videos = prepare_video_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_frames=self.num_frames,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
|
||||
return videos
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class InstructBlipVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
fast_video_processing_class = InstructBlipVideoVideoProcessor if is_torchvision_available() else None
|
||||
input_name = "pixel_values"
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.video_processor_tester = InstructBlipVideoVideoProcessingTester(self)
|
||||
|
||||
@property
|
||||
def video_processor_dict(self):
|
||||
return self.video_processor_tester.prepare_video_processor_dict()
|
||||
|
||||
def test_image_processor_properties(self):
|
||||
video_processing = self.fast_video_processing_class(**self.video_processor_dict)
|
||||
self.assertTrue(hasattr(video_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(video_processing, "size"))
|
||||
self.assertTrue(hasattr(video_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(video_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(video_processing, "image_std"))
|
||||
self.assertTrue(hasattr(video_processing, "do_convert_rgb"))
|
||||
|
||||
def test_video_processor_from_dict_with_kwargs(self):
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
|
||||
self.assertEqual(video_processor.size, {"height": 18, "width": 18})
|
||||
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
|
||||
self.assertEqual(video_processor.size, {"height": 42, "width": 42})
|
||||
@@ -18,12 +18,13 @@ import tempfile
|
||||
import unittest
|
||||
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import AutoProcessor, AutoTokenizer, InternVLProcessor
|
||||
from transformers.testing_utils import require_av, require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
from ...test_processing_common import MODALITY_INPUT_DATA, ProcessorTesterMixin
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
@@ -31,7 +32,7 @@ if is_torch_available():
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import GotOcr2ImageProcessor
|
||||
from transformers import GotOcr2ImageProcessor, InternVLVideoProcessor
|
||||
|
||||
|
||||
@require_vision
|
||||
@@ -55,12 +56,22 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
image_std=[0.229, 0.224, 0.225],
|
||||
do_convert_rgb=True,
|
||||
)
|
||||
video_processor = InternVLVideoProcessor(
|
||||
do_resize=True,
|
||||
size={"height": 20, "width": 20},
|
||||
do_rescale=True,
|
||||
rescale_factor=1 / 255,
|
||||
do_normalize=True,
|
||||
image_mean=[0.485, 0.456, 0.406],
|
||||
image_std=[0.229, 0.224, 0.225],
|
||||
do_convert_rgb=True,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained("OpenGVLab/InternVL3-1B-hf", padding_side="left")
|
||||
processor_kwargs = cls.prepare_processor_dict()
|
||||
processor = InternVLProcessor.from_pretrained(
|
||||
"OpenGVLab/InternVL3-1B-hf",
|
||||
processor = InternVLProcessor(
|
||||
image_processor=image_processor,
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
**processor_kwargs,
|
||||
)
|
||||
processor.save_pretrained(cls.tmpdirname)
|
||||
@@ -69,7 +80,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {"image_seq_length": 10}
|
||||
return {"image_seq_length": 2}
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
|
||||
@@ -77,6 +88,9 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_image_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
||||
|
||||
def get_video_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
|
||||
|
||||
def get_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
@@ -168,6 +182,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
|
||||
# Override video chat_template tests as InternVLProcessor returns flattened video features
|
||||
@require_av
|
||||
@require_torch
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
"""
|
||||
Tests that models can use their own preprocessing to preprocess conversations.
|
||||
@@ -225,7 +240,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="np",
|
||||
return_tensors="pt",
|
||||
num_frames=8,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
@@ -236,6 +251,8 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
# Difference with common tests, InternVLProcessor returns flattened video features, and uses 8 frames by default
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)
|
||||
|
||||
@require_torch
|
||||
@require_av
|
||||
def test_apply_chat_template_video_frame_sampling(self):
|
||||
processor = self.get_processor()
|
||||
|
||||
@@ -271,7 +288,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
num_frames=num_frames,
|
||||
return_tensors="np",
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), num_frames)
|
||||
@@ -284,6 +301,7 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 300)
|
||||
@@ -302,6 +320,97 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2)
|
||||
|
||||
@require_av
|
||||
@parameterized.expand([(1, "pt"), (2, "pt")])
|
||||
def test_apply_chat_template_video(self, batch_size: int, return_tensors: str):
|
||||
processor = self.get_processor()
|
||||
if processor.chat_template is None:
|
||||
self.skipTest("Processor has no chat template")
|
||||
|
||||
if "video_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"`video_processor` attribute not present in {self.processor_class}")
|
||||
|
||||
batch_messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{"type": "text", "text": "Describe this."}],
|
||||
},
|
||||
]
|
||||
] * batch_size
|
||||
|
||||
# Test that jinja can be applied
|
||||
formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
|
||||
self.assertEqual(len(formatted_prompt), batch_size)
|
||||
|
||||
# Test that tokenizing with template and directly with `self.tokenizer` gives same output
|
||||
formatted_prompt_tokenized = processor.apply_chat_template(
|
||||
batch_messages, add_generation_prompt=True, tokenize=True, return_tensors="pt"
|
||||
)
|
||||
add_special_tokens = True
|
||||
if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
|
||||
add_special_tokens = False
|
||||
tok_output = processor.tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=add_special_tokens)
|
||||
expected_output = tok_output.input_ids
|
||||
self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())
|
||||
|
||||
# Test that kwargs passed to processor's `__call__` are actually used
|
||||
tokenized_prompt_100 = processor.apply_chat_template(
|
||||
batch_messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
max_length=100,
|
||||
)
|
||||
self.assertEqual(len(tokenized_prompt_100[0]), 100)
|
||||
|
||||
# Test that `return_dict=True` returns text related inputs in the dict
|
||||
out_dict_text = processor.apply_chat_template(
|
||||
batch_messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"]))
|
||||
self.assertEqual(len(out_dict_text["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict_text["attention_mask"]), batch_size)
|
||||
|
||||
# Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
|
||||
for idx, url in enumerate(MODALITY_INPUT_DATA["videos"][:batch_size]):
|
||||
batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": "video", "url": url}]
|
||||
|
||||
out_dict = processor.apply_chat_template(
|
||||
batch_messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
num_frames=4, # by default no more than 4 frames, otherwise too slow
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
|
||||
video_len = 4 if batch_size == 1 else 3 # InternVL patches out and removes frames after processing
|
||||
self.assertEqual(len(out_dict[self.videos_input_name]), video_len)
|
||||
for k in out_dict:
|
||||
self.assertIsInstance(out_dict[k], torch.Tensor)
|
||||
|
||||
# Test continue from final message
|
||||
assistant_message = {
|
||||
"role": "assistant",
|
||||
"content": [{"type": "text", "text": "It is the sound of"}],
|
||||
}
|
||||
for batch_idx in range(batch_size):
|
||||
batch_messages[batch_idx] = batch_messages[batch_idx] + [assistant_message]
|
||||
continue_prompt = processor.apply_chat_template(batch_messages, continue_final_message=True, tokenize=False)
|
||||
for prompt in continue_prompt:
|
||||
self.assertTrue(prompt.endswith("It is the sound of")) # no `eos` token at the end
|
||||
|
||||
107
tests/models/internvl/test_video_processor_internvl.py
Normal file
107
tests/models/internvl/test_video_processor_internvl.py
Normal file
@@ -0,0 +1,107 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
pass
|
||||
|
||||
if is_vision_available():
|
||||
if is_torchvision_available():
|
||||
from transformers import InternVLVideoProcessor
|
||||
|
||||
|
||||
class InternVLVideoProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=5,
|
||||
num_frames=8,
|
||||
num_channels=3,
|
||||
min_resolution=30,
|
||||
max_resolution=80,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_normalize=True,
|
||||
image_mean=OPENAI_CLIP_MEAN,
|
||||
image_std=OPENAI_CLIP_STD,
|
||||
do_convert_rgb=True,
|
||||
):
|
||||
size = size if size is not None else {"height": 384, "width": 384}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_frames = num_frames
|
||||
self.num_channels = num_channels
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
def prepare_video_processor_dict(self):
|
||||
return {
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
}
|
||||
|
||||
def expected_output_video_shape(self, videos):
|
||||
return [self.num_frames, self.num_channels, self.size["height"], self.size["width"]]
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
|
||||
videos = prepare_video_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_frames=self.num_frames,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
return videos
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class InternVLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
fast_video_processing_class = InternVLVideoProcessor if is_torchvision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.video_processor_tester = InternVLVideoProcessingTester(self)
|
||||
|
||||
@property
|
||||
def video_processor_dict(self):
|
||||
return self.video_processor_tester.prepare_video_processor_dict()
|
||||
|
||||
def test_video_processor_from_dict_with_kwargs(self):
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
|
||||
self.assertEqual(video_processor.size, {"height": 384, "width": 384})
|
||||
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
|
||||
self.assertEqual(video_processor.size, {"height": 42, "width": 42})
|
||||
@@ -1,218 +0,0 @@
|
||||
# Copyright 2024 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import LlavaNextVideoImageProcessor
|
||||
|
||||
|
||||
class LlavaNextVideoProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=5,
|
||||
num_channels=3,
|
||||
image_size=18,
|
||||
min_resolution=30,
|
||||
max_resolution=80,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_center_crop=True,
|
||||
crop_size=None,
|
||||
do_normalize=True,
|
||||
image_mean=OPENAI_CLIP_MEAN,
|
||||
image_std=OPENAI_CLIP_STD,
|
||||
do_convert_rgb=True,
|
||||
):
|
||||
size = size if size is not None else {"shortest_edge": 20}
|
||||
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_channels = num_channels
|
||||
self.image_size = image_size
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_center_crop = do_center_crop
|
||||
self.crop_size = crop_size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
def prepare_image_processor_dict(self):
|
||||
return {
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
"do_center_crop": self.do_center_crop,
|
||||
"crop_size": self.crop_size,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
}
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape
|
||||
def expected_output_image_shape(self, images):
|
||||
return self.num_channels, self.crop_size["height"], self.crop_size["width"]
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs
|
||||
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
|
||||
return prepare_image_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
numpify=numpify,
|
||||
torchify=torchify,
|
||||
)
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
|
||||
images = prepare_image_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
numpify=numpify,
|
||||
torchify=torchify,
|
||||
)
|
||||
|
||||
# let's simply copy the frames to fake a long video-clip
|
||||
if numpify or torchify:
|
||||
videos = []
|
||||
for image in images:
|
||||
if numpify:
|
||||
video = image[None, ...].repeat(8, 0)
|
||||
else:
|
||||
video = image[None, ...].repeat(8, 1, 1, 1)
|
||||
videos.append(video)
|
||||
else:
|
||||
videos = []
|
||||
for pil_image in images:
|
||||
videos.append([pil_image] * 8)
|
||||
|
||||
return videos
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class LlavaNextVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = LlavaNextVideoImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = LlavaNextVideoProcessingTester(self)
|
||||
|
||||
@property
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
|
||||
def image_processor_dict(self):
|
||||
return self.image_processor_tester.prepare_image_processor_dict()
|
||||
|
||||
def test_image_processor_properties(self):
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
self.assertTrue(hasattr(image_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(image_processing, "size"))
|
||||
self.assertTrue(hasattr(image_processing, "do_center_crop"))
|
||||
self.assertTrue(hasattr(image_processing, "center_crop"))
|
||||
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(image_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(image_processing, "image_std"))
|
||||
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs
|
||||
def test_image_processor_from_dict_with_kwargs(self):
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class.from_dict(self.image_processor_dict)
|
||||
self.assertEqual(image_processor.size, {"shortest_edge": 20})
|
||||
self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
|
||||
|
||||
image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
|
||||
self.assertEqual(image_processor.size, {"shortest_edge": 42})
|
||||
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
|
||||
|
||||
def test_call_pil(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random numpy tensors
|
||||
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video[0], Image.Image)
|
||||
|
||||
# Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!)
|
||||
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (1, 8, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (5, 8, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
def test_call_numpy(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random numpy tensors
|
||||
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, numpify=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video, np.ndarray)
|
||||
|
||||
# Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!)
|
||||
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (1, 8, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (5, 8, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
def test_call_pytorch(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random PyTorch tensors
|
||||
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, torchify=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video, torch.Tensor)
|
||||
|
||||
# Test not batched input
|
||||
encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (1, 8, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (5, 8, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
@unittest.skip("LlavaNextVideoImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")
|
||||
def test_call_numpy_4_channels(self):
|
||||
pass
|
||||
@@ -19,13 +19,16 @@ import unittest
|
||||
|
||||
from transformers import AutoProcessor, LlamaTokenizerFast, LlavaNextVideoProcessor
|
||||
from transformers.testing_utils import require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import LlavaNextImageProcessor, LlavaNextVideoImageProcessor
|
||||
from transformers import LlavaNextImageProcessor
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import LlavaNextVideoVideoProcessor
|
||||
|
||||
if is_torch_available:
|
||||
pass
|
||||
@@ -39,7 +42,7 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def setUpClass(cls):
|
||||
cls.tmpdirname = tempfile.mkdtemp()
|
||||
image_processor = LlavaNextImageProcessor()
|
||||
video_processor = LlavaNextVideoImageProcessor()
|
||||
video_processor = LlavaNextVideoVideoProcessor()
|
||||
tokenizer = LlamaTokenizerFast.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
|
||||
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>", "<video>"]})
|
||||
processor_kwargs = cls.prepare_processor_dict()
|
||||
|
||||
@@ -0,0 +1,127 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
pass
|
||||
|
||||
if is_vision_available():
|
||||
if is_torchvision_available():
|
||||
from transformers import LlavaNextVideoVideoProcessor
|
||||
|
||||
|
||||
class LlavaNextVideoProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=5,
|
||||
num_frames=8,
|
||||
num_channels=3,
|
||||
min_resolution=30,
|
||||
max_resolution=80,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_center_crop=True,
|
||||
crop_size=None,
|
||||
do_normalize=True,
|
||||
image_mean=OPENAI_CLIP_MEAN,
|
||||
image_std=OPENAI_CLIP_STD,
|
||||
do_convert_rgb=True,
|
||||
):
|
||||
size = size if size is not None else {"height": 20, "width": 20}
|
||||
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_frames = num_frames
|
||||
self.num_channels = num_channels
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_center_crop = do_center_crop
|
||||
self.crop_size = crop_size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
def prepare_video_processor_dict(self):
|
||||
return {
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
"do_center_crop": self.do_center_crop,
|
||||
"crop_size": self.crop_size,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
}
|
||||
|
||||
def expected_output_video_shape(self, images):
|
||||
return self.num_frames, self.num_channels, self.crop_size["height"], self.crop_size["width"]
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
|
||||
videos = prepare_video_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_frames=self.num_frames,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
return videos
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class LlavaNextVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
fast_video_processing_class = LlavaNextVideoVideoProcessor if is_torchvision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.video_processor_tester = LlavaNextVideoProcessingTester(self)
|
||||
|
||||
@property
|
||||
def video_processor_dict(self):
|
||||
return self.video_processor_tester.prepare_video_processor_dict()
|
||||
|
||||
def test_video_processor_properties(self):
|
||||
video_processing = self.fast_video_processing_class(**self.video_processor_dict)
|
||||
self.assertTrue(hasattr(video_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(video_processing, "size"))
|
||||
self.assertTrue(hasattr(video_processing, "do_center_crop"))
|
||||
self.assertTrue(hasattr(video_processing, "center_crop"))
|
||||
self.assertTrue(hasattr(video_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(video_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(video_processing, "image_std"))
|
||||
self.assertTrue(hasattr(video_processing, "do_convert_rgb"))
|
||||
|
||||
def test_video_processor_from_dict_with_kwargs(self):
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
|
||||
self.assertEqual(video_processor.size, {"height": 20, "width": 20})
|
||||
self.assertEqual(video_processor.crop_size, {"height": 18, "width": 18})
|
||||
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42, crop_size=84)
|
||||
self.assertEqual(video_processor.size, {"shortest_edge": 42})
|
||||
self.assertEqual(video_processor.crop_size, {"height": 84, "width": 84})
|
||||
@@ -32,7 +32,7 @@ if is_vision_available():
|
||||
from transformers import LlavaOnevisionImageProcessor
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import LlavaOnevisionImageProcessorFast, LlavaOnevisionVideoProcessor
|
||||
from transformers import LlavaOnevisionImageProcessorFast
|
||||
|
||||
|
||||
class LlavaOnevisionImageProcessingTester:
|
||||
@@ -91,41 +91,12 @@ class LlavaOnevisionImageProcessingTester:
|
||||
torchify=torchify,
|
||||
)
|
||||
|
||||
# Copied from tests.models.llava_next_video.test_image_processing_llava_next_video.LlavaNextVideoProcessingTester.prepare_video_inputs
|
||||
def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
|
||||
images = prepare_image_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
numpify=numpify,
|
||||
torchify=torchify,
|
||||
)
|
||||
|
||||
# let's simply copy the frames to fake a long video-clip
|
||||
if numpify or torchify:
|
||||
videos = []
|
||||
for image in images:
|
||||
if numpify:
|
||||
video = image[None, ...].repeat(8, 0)
|
||||
else:
|
||||
video = image[None, ...].repeat(8, 1, 1, 1)
|
||||
videos.append(video)
|
||||
else:
|
||||
videos = []
|
||||
for pil_image in images:
|
||||
videos.append([pil_image] * 8)
|
||||
|
||||
return videos
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = LlavaOnevisionImageProcessor if is_vision_available() else None
|
||||
fast_image_processing_class = LlavaOnevisionImageProcessorFast if is_torchvision_available() else None
|
||||
video_processing_class = LlavaOnevisionVideoProcessor if is_vision_available() else None
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaOnevision
|
||||
def setUp(self):
|
||||
@@ -148,15 +119,6 @@ class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestC
|
||||
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
|
||||
self.assertTrue(hasattr(image_processing, "image_grid_pinpoints"))
|
||||
|
||||
def test_video_processor_properties(self):
|
||||
image_processing = self.video_processing_class(**self.image_processor_dict)
|
||||
self.assertTrue(hasattr(image_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(image_processing, "size"))
|
||||
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(image_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(image_processing, "image_std"))
|
||||
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
|
||||
|
||||
def test_image_processor_from_dict_with_kwargs(self):
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class.from_dict(self.image_processor_dict)
|
||||
@@ -248,58 +210,6 @@ class LlavaOnevisionImageProcessingTest(ImageProcessingTestMixin, unittest.TestC
|
||||
# Image processor should return same pixel values, independently of input format
|
||||
self.assertTrue((encoded_images_nested == encoded_images).all())
|
||||
|
||||
def test_call_pil_video(self):
|
||||
# Initialize image_processing
|
||||
video_processing = self.video_processing_class(**self.image_processor_dict)
|
||||
# create random numpy tensors
|
||||
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video[0], Image.Image)
|
||||
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (1, 8, 3, 20, 20)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = video_processing(video_inputs, return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (7, 8, 3, 20, 20)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
def test_call_numpy_video(self):
|
||||
# Initialize image_processing
|
||||
video_processing = self.video_processing_class(**self.image_processor_dict)
|
||||
# create random numpy tensors
|
||||
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, numpify=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video, np.ndarray)
|
||||
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (1, 8, 3, 20, 20)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = video_processing(video_inputs, return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (7, 8, 3, 20, 20)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
def test_call_pytorch_video(self):
|
||||
# Initialize image_processing
|
||||
video_processing = self.video_processing_class(**self.image_processor_dict)
|
||||
# create random PyTorch tensors
|
||||
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, torchify=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video, torch.Tensor)
|
||||
|
||||
# Test not batched input
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (1, 8, 3, 20, 20)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = video_processing(video_inputs, return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (7, 8, 3, 20, 20)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
@unittest.skip(
|
||||
reason="LlavaOnevisionImageProcessorFast doesn't compile (infinitely) when using class transforms"
|
||||
) # FIXME yoni
|
||||
|
||||
@@ -16,8 +16,8 @@ import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers.testing_utils import require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
@@ -27,15 +27,18 @@ if is_vision_available():
|
||||
AutoProcessor,
|
||||
LlavaOnevisionImageProcessor,
|
||||
LlavaOnevisionProcessor,
|
||||
LlavaOnevisionVideoProcessor,
|
||||
Qwen2TokenizerFast,
|
||||
)
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import LlavaOnevisionVideoProcessor
|
||||
|
||||
if is_torch_available:
|
||||
pass
|
||||
|
||||
|
||||
@require_vision
|
||||
@require_torch
|
||||
class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_class = LlavaOnevisionProcessor
|
||||
|
||||
|
||||
@@ -0,0 +1,116 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
pass
|
||||
|
||||
if is_vision_available():
|
||||
if is_torchvision_available():
|
||||
from transformers import LlavaOnevisionVideoProcessor
|
||||
|
||||
|
||||
class LlavaOnevisionVideoProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=7,
|
||||
num_frames=8,
|
||||
num_channels=3,
|
||||
min_resolution=30,
|
||||
max_resolution=400,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_normalize=True,
|
||||
image_mean=OPENAI_CLIP_MEAN,
|
||||
image_std=OPENAI_CLIP_STD,
|
||||
do_convert_rgb=True,
|
||||
):
|
||||
size = size if size is not None else {"height": 20, "width": 20}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_frames = num_frames
|
||||
self.num_channels = num_channels
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
def prepare_video_processor_dict(self):
|
||||
return {
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
}
|
||||
|
||||
def expected_output_video_shape(self, video):
|
||||
return self.num_frames, self.num_channels, self.size["height"], self.size["width"]
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
|
||||
videos = prepare_video_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_frames=self.num_frames,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
return videos
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class LlavaOnevisionVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
fast_video_processing_class = LlavaOnevisionVideoProcessor if is_torchvision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.video_processor_tester = LlavaOnevisionVideoProcessingTester(self)
|
||||
|
||||
@property
|
||||
def video_processor_dict(self):
|
||||
return self.video_processor_tester.prepare_video_processor_dict()
|
||||
|
||||
def test_video_processor_properties(self):
|
||||
video_processing = self.fast_video_processing_class(**self.video_processor_dict)
|
||||
self.assertTrue(hasattr(video_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(video_processing, "size"))
|
||||
self.assertTrue(hasattr(video_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(video_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(video_processing, "image_std"))
|
||||
self.assertTrue(hasattr(video_processing, "do_convert_rgb"))
|
||||
|
||||
def test_video_processor_from_dict_with_kwargs(self):
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
|
||||
self.assertEqual(video_processor.size, {"height": 20, "width": 20})
|
||||
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
|
||||
self.assertEqual(video_processor.size, {"shortest_edge": 42})
|
||||
@@ -16,7 +16,7 @@ import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import requests
|
||||
import numpy as np
|
||||
|
||||
from transformers import PixtralProcessor
|
||||
from transformers.testing_utils import require_vision
|
||||
@@ -30,7 +30,7 @@ if is_torch_available():
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
pass
|
||||
|
||||
|
||||
@require_vision
|
||||
@@ -42,11 +42,10 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.url_0 = "https://www.ilankelman.org/stopsigns/australia.jpg"
|
||||
cls.image_0 = Image.open(requests.get(cls.url_0, stream=True).raw)
|
||||
cls.image_0 = np.random.randint(255, size=(3, 876, 1300), dtype=np.uint8)
|
||||
cls.url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
cls.image_1 = Image.open(requests.get(cls.url_1, stream=True).raw)
|
||||
cls.url_2 = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
|
||||
cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw)
|
||||
cls.image_1 = np.random.randint(255, size=(3, 480, 640), dtype=np.uint8)
|
||||
cls.image_2 = np.random.randint(255, size=(3, 1024, 1024), dtype=np.uint8)
|
||||
|
||||
cls.tmpdirname = tempfile.mkdtemp()
|
||||
cls.addClassCleanup(lambda tempdir=cls.tmpdirname: shutil.rmtree(tempdir))
|
||||
|
||||
@@ -15,7 +15,7 @@ import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import requests
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from transformers.testing_utils import require_vision
|
||||
@@ -25,8 +25,6 @@ from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import PixtralProcessor
|
||||
|
||||
|
||||
@@ -37,11 +35,10 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.url_0 = "https://www.ilankelman.org/stopsigns/australia.jpg"
|
||||
cls.image_0 = Image.open(requests.get(cls.url_0, stream=True).raw)
|
||||
cls.image_0 = np.random.randint(255, size=(3, 876, 1300), dtype=np.uint8)
|
||||
cls.url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
cls.image_1 = Image.open(requests.get(cls.url_1, stream=True).raw)
|
||||
cls.url_2 = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
|
||||
cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw)
|
||||
cls.image_1 = np.random.randint(255, size=(3, 480, 640), dtype=np.uint8)
|
||||
cls.image_2 = np.random.randint(255, size=(3, 1024, 1024), dtype=np.uint8)
|
||||
|
||||
def setUp(self):
|
||||
self.tmpdirname = tempfile.mkdtemp()
|
||||
|
||||
@@ -64,8 +64,12 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
video_processor = self.get_component("video_processor")
|
||||
processor = self.processor_class(
|
||||
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
feature_extractor=feature_extractor,
|
||||
image_processor=image_processor,
|
||||
)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = "lower newer"
|
||||
@@ -91,8 +95,12 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
video_processor = self.get_component("video_processor")
|
||||
processor = self.processor_class(
|
||||
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
feature_extractor=feature_extractor,
|
||||
image_processor=image_processor,
|
||||
)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
@@ -125,8 +133,12 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
video_processor = self.get_component("video_processor")
|
||||
processor = self.processor_class(
|
||||
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
feature_extractor=feature_extractor,
|
||||
image_processor=image_processor,
|
||||
)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
@@ -159,7 +171,13 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor)
|
||||
video_processor = self.get_component("video_processor")
|
||||
_ = self.processor_class(
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
feature_extractor=feature_extractor,
|
||||
image_processor=image_processor,
|
||||
) # Why delete test? TODO: raushan double check tests after cleaning model
|
||||
|
||||
@require_torch
|
||||
def test_kwargs_overrides_default_tokenizer_kwargs_audio(self):
|
||||
@@ -175,7 +193,13 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor)
|
||||
video_processor = self.get_component("video_processor")
|
||||
_ = self.processor_class(
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
feature_extractor=feature_extractor,
|
||||
image_processor=image_processor,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
@@ -190,6 +214,9 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_image_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
||||
|
||||
def get_video_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
|
||||
|
||||
def get_feature_extractor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).feature_extractor
|
||||
|
||||
@@ -212,10 +239,14 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
|
||||
processor = Qwen2_5OmniProcessor(
|
||||
image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer
|
||||
video_processor = self.get_video_processor()
|
||||
processor = self.processor_class(
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
feature_extractor=feature_extractor,
|
||||
image_processor=image_processor,
|
||||
)
|
||||
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
processor = Qwen2_5OmniProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
||||
|
||||
@@ -230,9 +261,12 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
|
||||
processor = Qwen2_5OmniProcessor(
|
||||
image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer
|
||||
video_processor = self.get_video_processor()
|
||||
processor = self.processor_class(
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
feature_extractor=feature_extractor,
|
||||
image_processor=image_processor,
|
||||
)
|
||||
|
||||
image_input = self.prepare_image_inputs()
|
||||
@@ -247,9 +281,12 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
|
||||
processor = Qwen2_5OmniProcessor(
|
||||
image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer
|
||||
video_processor = self.get_video_processor()
|
||||
processor = self.processor_class(
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
feature_extractor=feature_extractor,
|
||||
image_processor=image_processor,
|
||||
)
|
||||
|
||||
input_str = "lower newer"
|
||||
@@ -281,9 +318,12 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
|
||||
processor = Qwen2_5OmniProcessor(
|
||||
image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer
|
||||
video_processor = self.get_video_processor()
|
||||
processor = self.processor_class(
|
||||
tokenizer=tokenizer,
|
||||
video_processor=video_processor,
|
||||
feature_extractor=feature_extractor,
|
||||
image_processor=image_processor,
|
||||
)
|
||||
|
||||
input_str = "lower newer"
|
||||
@@ -377,7 +417,10 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
self.assertEqual(len(out_dict[input_name]), batch_size * 1564)
|
||||
|
||||
video_len = 5760 if batch_size == 1 else 5808 # qwen pixels don't scale with bs same way as other models
|
||||
mm_len = batch_size * 1564 if modality == "image" else video_len
|
||||
self.assertEqual(len(out_dict[input_name]), mm_len)
|
||||
|
||||
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
|
||||
for k in out_dict:
|
||||
|
||||
@@ -55,6 +55,9 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_image_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
||||
|
||||
def get_video_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
|
||||
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {
|
||||
@@ -68,8 +71,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def test_save_load_pretrained_default(self):
|
||||
tokenizer = self.get_tokenizer()
|
||||
image_processor = self.get_image_processor()
|
||||
video_processor = self.get_video_processor()
|
||||
|
||||
processor = Qwen2_5_VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor = Qwen2_5_VLProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
||||
)
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
||||
|
||||
@@ -81,8 +87,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def test_image_processor(self):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
video_processor = self.get_video_processor()
|
||||
|
||||
processor = Qwen2_5_VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor = Qwen2_5_VLProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
||||
)
|
||||
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
@@ -95,8 +104,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def test_processor(self):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
video_processor = self.get_video_processor()
|
||||
|
||||
processor = Qwen2_5_VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor = Qwen2_5_VLProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
||||
)
|
||||
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
@@ -118,8 +130,11 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def test_model_input_names(self):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
video_processor = self.get_video_processor()
|
||||
|
||||
processor = Qwen2_5_VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor = Qwen2_5_VLProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
||||
)
|
||||
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
@@ -130,6 +145,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertListEqual(list(inputs.keys()), processor.model_input_names)
|
||||
|
||||
@require_torch
|
||||
@require_av
|
||||
def _test_apply_chat_template(
|
||||
self,
|
||||
modality: str,
|
||||
@@ -212,7 +228,10 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
self.assertEqual(len(out_dict[input_name]), batch_size * 192)
|
||||
|
||||
video_len = 360 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
|
||||
mm_len = batch_size * 192 if modality == "image" else video_len
|
||||
self.assertEqual(len(out_dict[input_name]), mm_len)
|
||||
|
||||
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
|
||||
for k in out_dict:
|
||||
@@ -394,7 +413,7 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="np",
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ import requests
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
|
||||
|
||||
@@ -34,8 +34,8 @@ if is_vision_available():
|
||||
|
||||
from transformers import Qwen2VLImageProcessor
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import Qwen2VLImageProcessorFast
|
||||
# if is_torchvision_available():
|
||||
# from transformers import Qwen2VLImageProcessorFast
|
||||
|
||||
|
||||
class Qwen2VLImageProcessingTester:
|
||||
@@ -118,7 +118,7 @@ class Qwen2VLImageProcessingTester:
|
||||
@require_vision
|
||||
class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = Qwen2VLImageProcessor if is_vision_available() else None
|
||||
fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None
|
||||
# fast_image_processing_class = Qwen2VLImageProcessorFast if is_torchvision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
@@ -23,7 +23,7 @@ from huggingface_hub import hf_hub_download
|
||||
|
||||
from transformers import AutoProcessor, Qwen2Tokenizer
|
||||
from transformers.testing_utils import require_av, require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
@@ -31,6 +31,9 @@ from ...test_processing_common import ProcessorTesterMixin
|
||||
if is_vision_available():
|
||||
from transformers import Qwen2VLImageProcessor, Qwen2VLProcessor
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import Qwen2VLVideoProcessor
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
@@ -55,6 +58,9 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_image_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
||||
|
||||
def get_video_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
|
||||
|
||||
@staticmethod
|
||||
def prepare_processor_dict():
|
||||
return {"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"} # fmt: skip
|
||||
@@ -66,8 +72,11 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def test_save_load_pretrained_default(self):
|
||||
tokenizer = self.get_tokenizer()
|
||||
image_processor = self.get_image_processor()
|
||||
video_processor = self.get_video_processor()
|
||||
|
||||
processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor = Qwen2VLProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
||||
)
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)
|
||||
|
||||
@@ -75,12 +84,16 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
|
||||
self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
|
||||
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
|
||||
self.assertIsInstance(processor.video_processor, Qwen2VLVideoProcessor)
|
||||
|
||||
def test_image_processor(self):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
video_processor = self.get_video_processor()
|
||||
|
||||
processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor = Qwen2VLProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
||||
)
|
||||
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
@@ -93,8 +106,11 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def test_processor(self):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
video_processor = self.get_video_processor()
|
||||
|
||||
processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor = Qwen2VLProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
||||
)
|
||||
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
@@ -113,8 +129,11 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def test_model_input_names(self):
|
||||
image_processor = self.get_image_processor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
video_processor = self.get_video_processor()
|
||||
|
||||
processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor = Qwen2VLProcessor(
|
||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
|
||||
)
|
||||
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
@@ -125,6 +144,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertListEqual(list(inputs.keys()), processor.model_input_names)
|
||||
|
||||
@require_torch
|
||||
@require_av
|
||||
def _test_apply_chat_template(
|
||||
self,
|
||||
modality: str,
|
||||
@@ -207,7 +227,10 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
self.assertEqual(len(out_dict[input_name]), batch_size * 192)
|
||||
|
||||
video_len = 360 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
|
||||
mm_len = batch_size * 192 if modality == "image" else video_len
|
||||
self.assertEqual(len(out_dict[input_name]), mm_len)
|
||||
|
||||
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
|
||||
for k in out_dict:
|
||||
@@ -373,7 +396,7 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="np",
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
|
||||
|
||||
291
tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
Normal file
291
tests/models/qwen2_vl/test_video_processing_qwen2_vl.py
Normal file
@@ -0,0 +1,291 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers.image_utils import get_image_size
|
||||
from transformers.models.qwen2_vl.video_processing_qwen2_vl import smart_resize
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import Qwen2VLVideoProcessor
|
||||
|
||||
|
||||
class Qwen2VLVideoProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=5,
|
||||
num_frames=8,
|
||||
num_channels=3,
|
||||
min_resolution=30,
|
||||
max_resolution=80,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_center_crop=True,
|
||||
crop_size=None,
|
||||
do_normalize=True,
|
||||
image_mean=OPENAI_CLIP_MEAN,
|
||||
image_std=OPENAI_CLIP_STD,
|
||||
do_convert_rgb=True,
|
||||
temporal_patch_size=2,
|
||||
patch_size=14,
|
||||
min_pixels=20 * 20,
|
||||
max_pixels=100 * 100,
|
||||
merge_size=2,
|
||||
):
|
||||
size = size if size is not None else {"shortest_edge": 20}
|
||||
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_frames = num_frames
|
||||
self.num_channels = num_channels
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_center_crop = do_center_crop
|
||||
self.crop_size = crop_size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
self.patch_size = patch_size
|
||||
self.min_pixels = min_pixels
|
||||
self.max_pixels = max_pixels
|
||||
self.merge_size = merge_size
|
||||
|
||||
def prepare_video_processor_dict(self):
|
||||
return {
|
||||
"do_resize": self.do_resize,
|
||||
"do_center_crop": self.do_center_crop,
|
||||
"crop_size": self.crop_size,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
"temporal_patch_size": self.temporal_patch_size,
|
||||
"patch_size": self.patch_size,
|
||||
"min_pixels": self.min_pixels,
|
||||
"max_pixels": self.max_pixels,
|
||||
"merge_size": self.merge_size,
|
||||
}
|
||||
|
||||
@require_vision
|
||||
def expected_output_video_shape(self, videos):
|
||||
grid_t = self.num_frames // self.temporal_patch_size
|
||||
hidden_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
|
||||
seq_len = 0
|
||||
for video in videos:
|
||||
if isinstance(video[0], Image.Image):
|
||||
video = np.stack([np.array(frame) for frame in video])
|
||||
height, width = get_image_size(video)
|
||||
resized_height, resized_width = smart_resize(
|
||||
height,
|
||||
width,
|
||||
factor=self.patch_size * self.merge_size,
|
||||
min_pixels=self.min_pixels,
|
||||
max_pixels=self.max_pixels,
|
||||
)
|
||||
grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
|
||||
seq_len += grid_t * grid_h * grid_w
|
||||
return [seq_len, hidden_dim]
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
|
||||
videos = prepare_video_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_frames=self.num_frames,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
return videos
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class Qwen2VLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
fast_video_processing_class = Qwen2VLVideoProcessor if is_torchvision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.video_processor_tester = Qwen2VLVideoProcessingTester(self)
|
||||
|
||||
@property
|
||||
def video_processor_dict(self):
|
||||
return self.video_processor_tester.prepare_video_processor_dict()
|
||||
|
||||
def test_video_processor_properties(self):
|
||||
video_processing = self.fast_video_processing_class(**self.video_processor_dict)
|
||||
self.assertTrue(hasattr(video_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(video_processing, "size"))
|
||||
self.assertTrue(hasattr(video_processing, "do_center_crop"))
|
||||
self.assertTrue(hasattr(video_processing, "center_crop"))
|
||||
self.assertTrue(hasattr(video_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(video_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(video_processing, "image_std"))
|
||||
self.assertTrue(hasattr(video_processing, "do_convert_rgb"))
|
||||
|
||||
# OVERRIDEN BECAUSE QWEN2_VL HAS SPECIAL OUTPUT SHAPES
|
||||
def test_video_processor_from_dict_with_kwargs(self):
|
||||
for video_processing_class in self.video_processor_list:
|
||||
video_processor = video_processing_class(**self.video_processor_dict)
|
||||
self.assertEqual(video_processor.min_pixels, self.video_processor_tester.min_pixels)
|
||||
self.assertEqual(video_processor.max_pixels, self.video_processor_tester.max_pixels)
|
||||
|
||||
video_processor = video_processing_class.from_dict(
|
||||
self.video_processor_dict, min_pixels=256 * 256, max_pixels=640 * 640
|
||||
)
|
||||
self.assertEqual(video_processor.min_pixels, 256 * 256)
|
||||
self.assertEqual(video_processor.max_pixels, 640 * 640)
|
||||
|
||||
def test_call_pil(self):
|
||||
for video_processing_class in self.video_processor_list:
|
||||
# Initialize video_processing
|
||||
video_processing = video_processing_class(**self.video_processor_dict)
|
||||
video_inputs = self.video_processor_tester.prepare_video_inputs(
|
||||
equal_resolution=False, return_tensors="pil"
|
||||
)
|
||||
|
||||
# Each video is a list of PIL Images
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video[0], Image.Image)
|
||||
|
||||
# Test not batched input
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
|
||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
|
||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
def test_call_numpy(self):
|
||||
for video_processing_class in self.video_processor_list:
|
||||
# Initialize video_processing
|
||||
video_processing = video_processing_class(**self.video_processor_dict)
|
||||
# create random numpy tensors
|
||||
video_inputs = self.video_processor_tester.prepare_video_inputs(
|
||||
equal_resolution=False, return_tensors="np"
|
||||
)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video, np.ndarray)
|
||||
|
||||
# Test not batched input
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
|
||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
|
||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
def test_call_pytorch(self):
|
||||
for video_processing_class in self.video_processor_list:
|
||||
# Initialize video_processing
|
||||
video_processing = video_processing_class(**self.video_processor_dict)
|
||||
# create random PyTorch tensors
|
||||
video_inputs = self.video_processor_tester.prepare_video_inputs(
|
||||
equal_resolution=False, return_tensors="torch"
|
||||
)
|
||||
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video, torch.Tensor)
|
||||
|
||||
# Test not batched input
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
|
||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
|
||||
encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
|
||||
self.assertEqual(
|
||||
list(encoded_videos.shape),
|
||||
expected_output_video_shape,
|
||||
)
|
||||
|
||||
def test_nested_input(self):
|
||||
"""Tests that the processor can work with nested list where each video is a list of arrays"""
|
||||
for video_processing_class in self.video_processor_list:
|
||||
video_processing = video_processing_class(**self.video_processor_dict)
|
||||
video_inputs = self.video_processor_tester.prepare_video_inputs(
|
||||
equal_resolution=False, return_tensors="np"
|
||||
)
|
||||
|
||||
# Test not batched input
|
||||
video_inputs_nested = [list(video) for video in video_inputs]
|
||||
encoded_videos = video_processing(video_inputs_nested[0], return_tensors="pt")[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
|
||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
|
||||
encoded_videos = video_processing(video_inputs_nested, return_tensors="pt")[self.input_name]
|
||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
@unittest.skip("Skip for now, the test needs adjustment fo Qwen2VL")
|
||||
def test_call_numpy_4_channels(self):
|
||||
for video_processing_class in self.video_processor_list:
|
||||
# Test that can process videos which have an arbitrary number of channels
|
||||
# Initialize video_processing
|
||||
video_processor = video_processing_class(**self.video_processor_dict)
|
||||
|
||||
# create random numpy tensors
|
||||
self.video_processor_tester.num_channels = 4
|
||||
video_inputs = self.video_processor_tester.prepare_video_inputs(
|
||||
equal_resolution=False, return_tensors="np"
|
||||
)
|
||||
|
||||
# Test not batched input
|
||||
encoded_videos = video_processor(
|
||||
video_inputs[0],
|
||||
return_tensors="pt",
|
||||
input_data_format="channels_last",
|
||||
image_mean=0,
|
||||
image_std=1,
|
||||
)[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
|
||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = video_processor(
|
||||
video_inputs,
|
||||
return_tensors="pt",
|
||||
input_data_format="channels_last",
|
||||
image_mean=0,
|
||||
image_std=1,
|
||||
)[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
|
||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
@@ -22,7 +22,7 @@ import requests
|
||||
|
||||
from transformers import SmolVLMProcessor
|
||||
from transformers.models.auto.processing_auto import AutoProcessor
|
||||
from transformers.testing_utils import require_av, require_torch, require_vision
|
||||
from transformers.testing_utils import is_flaky, require_av, require_torch, require_vision
|
||||
from transformers.utils import is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
@@ -63,6 +63,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
)
|
||||
cls.bos_token = processor.tokenizer.bos_token
|
||||
cls.image_token = processor.image_token
|
||||
cls.video_token = processor.image_token * 8 # SmolVLM uses image token and repeats it `num_frames` times
|
||||
cls.fake_image_token = processor.fake_image_token
|
||||
cls.global_img_token = processor.global_image_token
|
||||
|
||||
@@ -79,6 +80,9 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def get_image_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
||||
|
||||
def get_video_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
|
||||
|
||||
def get_processor(self, **kwargs):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
@@ -114,6 +118,10 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
def tearDownClass(cls):
|
||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||
|
||||
@is_flaky # fails 15 out of 100, FIXME @raushan
|
||||
def test_structured_kwargs_nested_from_dict_video(self):
|
||||
super().test_structured_kwargs_nested_from_dict_video()
|
||||
|
||||
def test_process_interleaved_images_prompts_no_image_splitting(self):
|
||||
processor_components = self.prepare_components()
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", padding_side="left")
|
||||
@@ -433,10 +441,13 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
video_processor = self.get_component("video_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs)
|
||||
processor = self.processor_class(
|
||||
tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor, **processor_kwargs
|
||||
)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
|
||||
@@ -556,3 +567,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
padding=True,
|
||||
max_length=20,
|
||||
)
|
||||
|
||||
@unittest.skip("SmolVLM cannot accept image URL as video frames, because it needs to know video fps and duration")
|
||||
def test_apply_chat_template_video_1(self):
|
||||
pass
|
||||
|
||||
118
tests/models/smolvlm/test_video_processing_smolvlm.py
Normal file
118
tests/models/smolvlm/test_video_processing_smolvlm.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_vision_available():
|
||||
if is_torchvision_available():
|
||||
from transformers import SmolVLMVideoProcessor
|
||||
from transformers.models.smolvlm.video_processing_smolvlm import get_resize_output_image_size
|
||||
|
||||
|
||||
class SmolVLMVideoProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=5,
|
||||
num_frames=8,
|
||||
num_channels=3,
|
||||
min_resolution=30,
|
||||
max_resolution=80,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_normalize=True,
|
||||
image_mean=IMAGENET_STANDARD_MEAN,
|
||||
image_std=IMAGENET_STANDARD_STD,
|
||||
do_convert_rgb=True,
|
||||
):
|
||||
size = size if size is not None else {"longest_edge": 20}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_frames = num_frames
|
||||
self.num_channels = num_channels
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
def prepare_video_processor_dict(self):
|
||||
return {
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
}
|
||||
|
||||
def expected_output_video_shape(self, videos):
|
||||
max_height, max_width = 0, 0
|
||||
if not isinstance(videos[0], torch.Tensor):
|
||||
videos = [torch.tensor(np.array(video)).permute(0, -1, -3, -2) for video in videos]
|
||||
for video in videos:
|
||||
height, width = get_resize_output_image_size(video, self.size["longest_edge"])
|
||||
max_height = max(height, max_height)
|
||||
max_width = max(width, max_width)
|
||||
return [self.num_frames, self.num_channels, max_height, max_width]
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
|
||||
videos = prepare_video_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_frames=self.num_frames,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
return videos
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class SmolVLMVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
fast_video_processing_class = SmolVLMVideoProcessor if is_torchvision_available() else None
|
||||
input_name = "pixel_values"
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.video_processor_tester = SmolVLMVideoProcessingTester(self)
|
||||
|
||||
@property
|
||||
def video_processor_dict(self):
|
||||
return self.video_processor_tester.prepare_video_processor_dict()
|
||||
|
||||
def test_video_processor_from_dict_with_kwargs(self):
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
|
||||
self.assertEqual(video_processor.size, {"longest_edge": 20})
|
||||
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
|
||||
self.assertEqual(video_processor.size, {"height": 42, "width": 42})
|
||||
@@ -1,327 +0,0 @@
|
||||
# Copyright 2024 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import VideoLlavaImageProcessor
|
||||
|
||||
|
||||
class VideoLlavaImageProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=5,
|
||||
num_channels=3,
|
||||
image_size=18,
|
||||
min_resolution=30,
|
||||
max_resolution=80,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_center_crop=True,
|
||||
crop_size=None,
|
||||
do_normalize=True,
|
||||
image_mean=OPENAI_CLIP_MEAN,
|
||||
image_std=OPENAI_CLIP_STD,
|
||||
do_convert_rgb=True,
|
||||
):
|
||||
size = size if size is not None else {"shortest_edge": 20}
|
||||
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_channels = num_channels
|
||||
self.image_size = image_size
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_center_crop = do_center_crop
|
||||
self.crop_size = crop_size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
def prepare_image_processor_dict(self):
|
||||
return {
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
"do_center_crop": self.do_center_crop,
|
||||
"crop_size": self.crop_size,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
}
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape
|
||||
def expected_output_image_shape(self, images):
|
||||
return self.num_channels, self.crop_size["height"], self.crop_size["width"]
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs
|
||||
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
|
||||
return prepare_image_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
numpify=numpify,
|
||||
torchify=torchify,
|
||||
)
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
|
||||
images = prepare_image_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
numpify=numpify,
|
||||
torchify=torchify,
|
||||
)
|
||||
# let's simply copy the frames to fake a long video-clip
|
||||
if numpify or torchify:
|
||||
videos = []
|
||||
for image in images:
|
||||
if numpify:
|
||||
video = image[None, ...].repeat(8, 0)
|
||||
else:
|
||||
video = image[None, ...].repeat(8, 1, 1, 1)
|
||||
videos.append(video)
|
||||
else:
|
||||
videos = []
|
||||
for pil_image in images:
|
||||
videos.append([pil_image] * 8)
|
||||
|
||||
return videos
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class VideoLlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = VideoLlavaImageProcessor if is_vision_available() else None
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->VideoLlava
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.image_processor_tester = VideoLlavaImageProcessingTester(self)
|
||||
|
||||
@property
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
|
||||
def image_processor_dict(self):
|
||||
return self.image_processor_tester.prepare_image_processor_dict()
|
||||
|
||||
def test_image_processor_properties(self):
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
self.assertTrue(hasattr(image_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(image_processing, "size"))
|
||||
self.assertTrue(hasattr(image_processing, "do_center_crop"))
|
||||
self.assertTrue(hasattr(image_processing, "center_crop"))
|
||||
self.assertTrue(hasattr(image_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(image_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(image_processing, "image_std"))
|
||||
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
|
||||
|
||||
# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs
|
||||
def test_image_processor_from_dict_with_kwargs(self):
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class.from_dict(self.image_processor_dict)
|
||||
self.assertEqual(image_processor.size, {"shortest_edge": 20})
|
||||
self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
|
||||
|
||||
image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
|
||||
self.assertEqual(image_processor.size, {"shortest_edge": 42})
|
||||
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
|
||||
|
||||
def test_call_pil(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random PIL images
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
|
||||
for image in image_inputs:
|
||||
self.assertIsInstance(image, Image.Image)
|
||||
|
||||
# Test not batched input
|
||||
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values_images
|
||||
expected_output_image_shape = (1, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values_images
|
||||
expected_output_image_shape = (5, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
||||
|
||||
def test_call_numpy(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random numpy tensors
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
|
||||
for image in image_inputs:
|
||||
self.assertIsInstance(image, np.ndarray)
|
||||
|
||||
# Test not batched input
|
||||
encoded_images = image_processing(images=image_inputs[0], return_tensors="pt").pixel_values_images
|
||||
expected_output_image_shape = (1, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_images = image_processing(images=image_inputs, return_tensors="pt").pixel_values_images
|
||||
expected_output_image_shape = (5, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
||||
|
||||
def test_call_numpy_videos(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random numpy tensors
|
||||
video_inputs = self.image_processor_tester.prepare_video_inputs(numpify=True, equal_resolution=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video, np.ndarray)
|
||||
|
||||
# Test not batched input
|
||||
encoded_videos = image_processing(images=None, videos=video_inputs[0], return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (1, 8, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = image_processing(images=None, videos=video_inputs, return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (5, 8, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
def test_call_pil_videos(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# the inputs come in list of lists batched format
|
||||
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video[0], Image.Image)
|
||||
|
||||
# Test not batched input
|
||||
encoded_videos = image_processing(images=None, videos=video_inputs[0], return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (1, 8, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = image_processing(images=None, videos=video_inputs, return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (5, 8, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
def test_call_pytorch(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random PyTorch tensors
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
|
||||
|
||||
for image in image_inputs:
|
||||
self.assertIsInstance(image, torch.Tensor)
|
||||
|
||||
# Test not batched input
|
||||
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values_images
|
||||
expected_output_image_shape = (1, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values_images
|
||||
expected_output_image_shape = (5, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
|
||||
|
||||
def test_call_pytorch_videos(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random PyTorch tensors
|
||||
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, torchify=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video, torch.Tensor)
|
||||
|
||||
# Test not batched input
|
||||
encoded_videos = image_processing(images=None, videos=video_inputs[0], return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (1, 8, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = image_processing(images=None, videos=video_inputs, return_tensors="pt").pixel_values_videos
|
||||
expected_output_video_shape = (5, 8, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
@parameterized.expand([(True, False), (False, True)])
|
||||
def test_call_mixed(self, numpify, torchify):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random numpy tensors
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(
|
||||
equal_resolution=True, numpify=numpify, torchify=torchify
|
||||
)
|
||||
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, torchify=torchify)
|
||||
|
||||
# Test not batched input
|
||||
encoded = image_processing(images=image_inputs[0], videos=video_inputs[0], return_tensors="pt")
|
||||
expected_output_video_shape = (1, 8, 3, 18, 18)
|
||||
expected_output_image_shape = (1, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded.pixel_values_videos.shape), expected_output_video_shape)
|
||||
self.assertEqual(tuple(encoded.pixel_values_images.shape), expected_output_image_shape)
|
||||
|
||||
# Test batched
|
||||
encoded = image_processing(images=image_inputs, videos=video_inputs, return_tensors="pt")
|
||||
expected_output_video_shape = (5, 8, 3, 18, 18)
|
||||
expected_output_image_shape = (5, 3, 18, 18)
|
||||
self.assertEqual(tuple(encoded.pixel_values_videos.shape), expected_output_video_shape)
|
||||
self.assertEqual(tuple(encoded.pixel_values_images.shape), expected_output_image_shape)
|
||||
|
||||
def test_call_numpy_4_channels(self):
|
||||
# Test that can process images which have an arbitrary number of channels
|
||||
# Initialize image_processing
|
||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
||||
|
||||
# create random numpy tensors
|
||||
self.image_processor_tester.num_channels = 4
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
|
||||
|
||||
# Test not batched input
|
||||
encoded_images = image_processor(
|
||||
image_inputs[0],
|
||||
return_tensors="pt",
|
||||
input_data_format="channels_last",
|
||||
image_mean=0,
|
||||
image_std=1,
|
||||
).pixel_values_images
|
||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
|
||||
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
|
||||
|
||||
# Test batched
|
||||
encoded_images = image_processor(
|
||||
image_inputs,
|
||||
return_tensors="pt",
|
||||
input_data_format="channels_last",
|
||||
image_mean=0,
|
||||
image_std=1,
|
||||
).pixel_values_images
|
||||
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
|
||||
self.assertEqual(
|
||||
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
|
||||
)
|
||||
122
tests/models/video_llava/test_video_processing_video_llava.py
Normal file
122
tests/models/video_llava/test_video_processing_video_llava.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
pass
|
||||
|
||||
if is_vision_available():
|
||||
if is_torchvision_available():
|
||||
from transformers import VideoLlavaVideoProcessor
|
||||
|
||||
|
||||
class VideoLlavaVideoProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=5,
|
||||
num_frames=8,
|
||||
num_channels=3,
|
||||
image_size=18,
|
||||
min_resolution=30,
|
||||
max_resolution=80,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_center_crop=True,
|
||||
crop_size=None,
|
||||
do_normalize=True,
|
||||
image_mean=OPENAI_CLIP_MEAN,
|
||||
image_std=OPENAI_CLIP_STD,
|
||||
do_convert_rgb=True,
|
||||
):
|
||||
super().__init__()
|
||||
size = size if size is not None else {"shortest_edge": 20}
|
||||
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_frames = num_frames
|
||||
self.num_channels = num_channels
|
||||
self.image_size = image_size
|
||||
self.min_resolution = min_resolution
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_center_crop = do_center_crop
|
||||
self.crop_size = crop_size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
def prepare_video_processor_dict(self):
|
||||
return {
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
"do_center_crop": self.do_center_crop,
|
||||
"crop_size": self.crop_size,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
}
|
||||
|
||||
def expected_output_video_shape(self, images):
|
||||
return self.num_frames, self.num_channels, self.crop_size["height"], self.crop_size["width"]
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
|
||||
videos = prepare_video_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_frames=self.num_frames,
|
||||
num_channels=self.num_channels,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=equal_resolution,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
|
||||
return videos
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class VideoLlavaVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
fast_video_processing_class = VideoLlavaVideoProcessor if is_torchvision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.video_processor_tester = VideoLlavaVideoProcessingTester(self)
|
||||
|
||||
@property
|
||||
def video_processor_dict(self):
|
||||
return self.video_processor_tester.prepare_video_processor_dict()
|
||||
|
||||
def test_video_processor_properties(self):
|
||||
video_processing = self.fast_video_processing_class(**self.video_processor_dict)
|
||||
self.assertTrue(hasattr(video_processing, "do_resize"))
|
||||
self.assertTrue(hasattr(video_processing, "size"))
|
||||
self.assertTrue(hasattr(video_processing, "do_center_crop"))
|
||||
self.assertTrue(hasattr(video_processing, "center_crop"))
|
||||
self.assertTrue(hasattr(video_processing, "do_normalize"))
|
||||
self.assertTrue(hasattr(video_processing, "image_mean"))
|
||||
self.assertTrue(hasattr(video_processing, "image_std"))
|
||||
self.assertTrue(hasattr(video_processing, "do_convert_rgb"))
|
||||
Reference in New Issue
Block a user