Qwen2-VL: clean-up and add more tests (#33354)

* clean-up on qwen2-vl and add generation tests

* add video tests

* Update tests/models/qwen2_vl/test_processing_qwen2_vl.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* fix and add better tests

* Update src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* update docs and address comments

* Update docs/source/en/model_doc/qwen2_vl.md

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update docs/source/en/model_doc/qwen2_vl.md

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* update

* remove size at all

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
Raushan Turganbay
2024-09-12 18:24:04 +02:00
committed by GitHub
parent 8f8af0fb38
commit 2f611d30d9
6 changed files with 297 additions and 106 deletions

View File

@@ -65,6 +65,7 @@ class Qwen2VLVisionText2TextModelTester:
image_size=28,
bos_token_id=0,
eos_token_id=1,
pad_token_id=2,
vision_start_token_id=151652,
image_token_id=151655,
video_token_id=151656,
@@ -76,7 +77,7 @@ class Qwen2VLVisionText2TextModelTester:
max_window_layers=3,
model_type="qwen2_vl",
num_attention_heads=4,
num_hidden_layers=3,
num_hidden_layers=4,
num_key_value_heads=2,
rope_theta=10000,
tie_word_embeddings=True,
@@ -98,6 +99,7 @@ class Qwen2VLVisionText2TextModelTester:
self.ignore_index = ignore_index
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.vision_start_token_id = vision_start_token_id
self.image_token_id = image_token_id
self.video_token_id = video_token_id
@@ -137,6 +139,7 @@ class Qwen2VLVisionText2TextModelTester:
tie_word_embeddings=self.tie_word_embeddings,
bos_token_id=self.bos_token_id,
eos_token_id=self.eos_token_id,
pad_token_id=self.pad_token_id,
vision_start_token_id=self.vision_start_token_id,
image_token_id=self.image_token_id,
video_token_id=self.video_token_id,
@@ -162,6 +165,8 @@ class Qwen2VLVisionText2TextModelTester:
vision_seqlen = pixel_values.shape[0] // self.batch_size // (self.vision_config["spatial_merge_size"] ** 2)
input_ids = ids_tensor([self.batch_size, self.seq_length - 1 + vision_seqlen], self.vocab_size)
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
input_ids[input_ids == self.image_token_id] = self.pad_token_id
input_ids[:, torch.arange(vision_seqlen, device=torch_device) + 1] = self.image_token_id
labels = torch.zeros(
(self.batch_size, self.seq_length - 1 + vision_seqlen),
@@ -221,6 +226,7 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
"""
all_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
@@ -300,6 +306,12 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
def test_model_is_small(self):
pass
@unittest.skip(
reason="Qwen2-VL can't do low-memory generation because position IDs have extra dimension and split function doesn't work for that"
)
def test_beam_search_low_memory(self):
pass
@require_torch
class Qwen2VLIntegrationTest(unittest.TestCase):

View File

@@ -0,0 +1,237 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import tempfile
import unittest
import pytest
from transformers import AutoProcessor, Qwen2Tokenizer
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from transformers import Qwen2VLImageProcessor, Qwen2VLProcessor
@require_vision
@require_torch
class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = Qwen2VLProcessor
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", patch_size=4)
processor.save_pretrained(self.tmpdirname)
def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_save_load_pretrained_default(self):
tokenizer = self.get_tokenizer()
image_processor = self.get_image_processor()
processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
processor.save_pretrained(self.tmpdirname)
processor = Qwen2VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor.tokenizer, Qwen2Tokenizer)
self.assertIsInstance(processor.image_processor, Qwen2VLImageProcessor)
def test_image_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
image_input = self.prepare_image_inputs()
input_image_proc = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, text="dummy", return_tensors="np")
for key in input_image_proc.keys():
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input)
self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values", "image_grid_thw"])
# test if it raises when no input is passed
with pytest.raises(ValueError):
processor()
# test if it raises when no text is passed
with pytest.raises(TypeError):
processor(images=image_input)
def test_model_input_names(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
video_inputs = self.prepare_video_inputs()
inputs = processor(text=input_str, images=image_input, videos=video_inputs)
self.assertListEqual(list(inputs.keys()), processor.model_input_names)
# Qwen2-VL doesn't accept `size` and resized to an optimal size using image_processor attrbutes
# defined at `init`. Therefore, all tests are overwritten and don't actually test if kwargs are passed
# to image processors
def test_image_processor_defaults_preserved_by_image_kwargs(self):
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input)
self.assertEqual(inputs["pixel_values"].shape[0], 800)
def test_kwargs_overrides_default_image_processor_kwargs(self):
image_processor = self.get_component(
"image_processor",
)
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input)
self.assertEqual(inputs["pixel_values"].shape[0], 800)
def test_unstructured_kwargs(self):
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
padding="max_length",
max_length=76,
)
self.assertEqual(inputs["pixel_values"].shape[0], 800)
self.assertEqual(len(inputs["input_ids"][0]), 76)
def test_unstructured_kwargs_batched(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = ["lower newer", "upper older longer string"]
image_input = self.prepare_image_inputs() * 2
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
padding="longest",
max_length=76,
)
self.assertEqual(inputs["pixel_values"].shape[0], 1600)
self.assertEqual(len(inputs["input_ids"][0]), 4)
def test_structured_kwargs_nested(self):
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"text_kwargs": {"padding": "max_length", "max_length": 76},
}
inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.skip_processor_without_typed_kwargs(processor)
self.assertEqual(inputs["pixel_values"].shape[0], 800)
self.assertEqual(len(inputs["input_ids"][0]), 76)
def test_structured_kwargs_nested_from_dict(self):
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"text_kwargs": {"padding": "max_length", "max_length": 76},
}
inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.assertEqual(inputs["pixel_values"].shape[0], 800)
self.assertEqual(len(inputs["input_ids"][0]), 76)
def test_image_processor_defaults_preserved_by_video_kwargs(self):
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
video_input = self.prepare_video_inputs()
inputs = processor(text=input_str, videos=video_input)
self.assertEqual(inputs["pixel_values_videos"].shape[0], 9600)

View File

@@ -23,15 +23,12 @@ try:
from typing import Unpack
except ImportError:
from typing_extensions import Unpack
import unittest
import numpy as np
from transformers import CLIPTokenizerFast, ProcessorMixin
from transformers.models.auto.processing_auto import processor_class_from_name
from transformers.testing_utils import (
check_json_file_has_correct_format,
require_tokenizers,
require_torch,
require_vision,
)
@@ -41,8 +38,6 @@ from transformers.utils import is_vision_available
if is_vision_available():
from PIL import Image
from transformers import CLIPImageProcessor
def prepare_image_inputs():
"""This function prepares a list of PIL images"""
@@ -53,7 +48,6 @@ def prepare_image_inputs():
@require_torch
@require_vision
@require_torch
class ProcessorTesterMixin:
processor_class = None
@@ -91,6 +85,13 @@ class ProcessorTesterMixin:
"""This function prepares a list of PIL images for testing"""
return prepare_image_inputs()
@require_vision
def prepare_video_inputs(self):
"""This function prepares a list of numpy videos."""
video_input = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] * 8
image_inputs = [video_input] * 3 # batch-size=3
return image_inputs
def test_processor_to_json_string(self):
processor = self.get_processor()
obj = json.loads(processor.to_json_string())
@@ -125,8 +126,6 @@ class ProcessorTesterMixin:
if not is_kwargs_typed_dict:
self.skipTest(f"{self.processor_class} doesn't have typed kwargs.")
@require_vision
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -141,8 +140,6 @@ class ProcessorTesterMixin:
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(len(inputs["input_ids"][0]), 117)
@require_torch
@require_vision
def test_image_processor_defaults_preserved_by_image_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -158,8 +155,6 @@ class ProcessorTesterMixin:
inputs = processor(text=input_str, images=image_input)
self.assertEqual(len(inputs["pixel_values"][0][0]), 234)
@require_vision
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -176,8 +171,6 @@ class ProcessorTesterMixin:
)
self.assertEqual(len(inputs["input_ids"][0]), 112)
@require_torch
@require_vision
def test_kwargs_overrides_default_image_processor_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -193,8 +186,6 @@ class ProcessorTesterMixin:
inputs = processor(text=input_str, images=image_input, size=[224, 224])
self.assertEqual(len(inputs["pixel_values"][0][0]), 224)
@require_torch
@require_vision
def test_unstructured_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -218,8 +209,6 @@ class ProcessorTesterMixin:
self.assertEqual(inputs["pixel_values"].shape[2], 214)
self.assertEqual(len(inputs["input_ids"][0]), 76)
@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -244,8 +233,6 @@ class ProcessorTesterMixin:
self.assertEqual(len(inputs["input_ids"][0]), 6)
@require_torch
@require_vision
def test_doubly_passed_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -265,8 +252,6 @@ class ProcessorTesterMixin:
size={"height": 214, "width": 214},
)
@require_torch
@require_vision
def test_structured_kwargs_nested(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -293,8 +278,6 @@ class ProcessorTesterMixin:
self.assertEqual(len(inputs["input_ids"][0]), 76)
@require_torch
@require_vision
def test_structured_kwargs_nested_from_dict(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -318,48 +301,3 @@ class ProcessorTesterMixin:
self.assertEqual(inputs["pixel_values"].shape[2], 214)
self.assertEqual(len(inputs["input_ids"][0]), 76)
class MyProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"]
image_processor_class = "CLIPImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, processor_attr_1=1, processor_attr_2=True):
super().__init__(image_processor, tokenizer)
self.processor_attr_1 = processor_attr_1
self.processor_attr_2 = processor_attr_2
@require_tokenizers
@require_vision
class ProcessorTest(unittest.TestCase):
processor_class = MyProcessor
def prepare_processor_dict(self):
return {"processor_attr_1": 1, "processor_attr_2": False}
def get_processor(self):
image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
processor = MyProcessor(image_processor, tokenizer, **self.prepare_processor_dict())
return processor
def test_processor_to_json_string(self):
processor = self.get_processor()
obj = json.loads(processor.to_json_string())
for key, value in self.prepare_processor_dict().items():
self.assertEqual(obj[key], value)
self.assertEqual(getattr(processor, key, None), value)
def test_processor_from_and_save_pretrained(self):
processor_first = self.get_processor()
with tempfile.TemporaryDirectory() as tmpdirname:
saved_file = processor_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
processor_second = self.processor_class.from_pretrained(tmpdirname)
self.assertEqual(processor_second.to_dict(), processor_first.to_dict())