Qwen2-VL: clean-up and add more tests (#33354)

* clean-up on qwen2-vl and add generation tests

* add video tests

* Update tests/models/qwen2_vl/test_processing_qwen2_vl.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* fix and add better tests

* Update src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* update docs and address comments

* Update docs/source/en/model_doc/qwen2_vl.md

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update docs/source/en/model_doc/qwen2_vl.md

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* update

* remove size entirely

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
Raushan Turganbay
2024-09-12 18:24:04 +02:00
committed by GitHub
parent 8f8af0fb38
commit 2f611d30d9
6 changed files with 297 additions and 106 deletions

View File

@@ -23,15 +23,12 @@ try:
from typing import Unpack
except ImportError:
from typing_extensions import Unpack
import unittest
import numpy as np
from transformers import CLIPTokenizerFast, ProcessorMixin
from transformers.models.auto.processing_auto import processor_class_from_name
from transformers.testing_utils import (
check_json_file_has_correct_format,
require_tokenizers,
require_torch,
require_vision,
)
@@ -41,8 +38,6 @@ from transformers.utils import is_vision_available
if is_vision_available():
from PIL import Image
from transformers import CLIPImageProcessor
def prepare_image_inputs():
"""This function prepares a list of PIL images"""
@@ -53,7 +48,6 @@ def prepare_image_inputs():
@require_torch
@require_vision
@require_torch
class ProcessorTesterMixin:
processor_class = None
@@ -91,6 +85,13 @@ class ProcessorTesterMixin:
"""This function prepares a list of PIL images for testing"""
return prepare_image_inputs()
@require_vision
def prepare_video_inputs(self):
    """Build a batch of dummy videos made of numpy frames.

    One random ``uint8`` array of shape ``(3, 30, 400)`` is drawn and reused
    as every frame: a video is a list of 8 references to that array, and the
    returned batch is a list of 3 references to that single video list.

    Returns:
        list: the batch, i.e. a list (length 3) of lists (length 8) of arrays.
    """
    num_frames, batch_size = 8, 3
    # Only one array is sampled; frames and videos below alias it on purpose,
    # matching the cheap fixture the tests were written against.
    frame = np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)
    video = [frame] * num_frames
    return [video] * batch_size
def test_processor_to_json_string(self):
processor = self.get_processor()
obj = json.loads(processor.to_json_string())
@@ -125,8 +126,6 @@ class ProcessorTesterMixin:
if not is_kwargs_typed_dict:
self.skipTest(f"{self.processor_class} doesn't have typed kwargs.")
@require_vision
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -141,8 +140,6 @@ class ProcessorTesterMixin:
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(len(inputs["input_ids"][0]), 117)
@require_torch
@require_vision
def test_image_processor_defaults_preserved_by_image_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -158,8 +155,6 @@ class ProcessorTesterMixin:
inputs = processor(text=input_str, images=image_input)
self.assertEqual(len(inputs["pixel_values"][0][0]), 234)
@require_vision
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -176,8 +171,6 @@ class ProcessorTesterMixin:
)
self.assertEqual(len(inputs["input_ids"][0]), 112)
@require_torch
@require_vision
def test_kwargs_overrides_default_image_processor_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -193,8 +186,6 @@ class ProcessorTesterMixin:
inputs = processor(text=input_str, images=image_input, size=[224, 224])
self.assertEqual(len(inputs["pixel_values"][0][0]), 224)
@require_torch
@require_vision
def test_unstructured_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -218,8 +209,6 @@ class ProcessorTesterMixin:
self.assertEqual(inputs["pixel_values"].shape[2], 214)
self.assertEqual(len(inputs["input_ids"][0]), 76)
@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -244,8 +233,6 @@ class ProcessorTesterMixin:
self.assertEqual(len(inputs["input_ids"][0]), 6)
@require_torch
@require_vision
def test_doubly_passed_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -265,8 +252,6 @@ class ProcessorTesterMixin:
size={"height": 214, "width": 214},
)
@require_torch
@require_vision
def test_structured_kwargs_nested(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -293,8 +278,6 @@ class ProcessorTesterMixin:
self.assertEqual(len(inputs["input_ids"][0]), 76)
@require_torch
@require_vision
def test_structured_kwargs_nested_from_dict(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -318,48 +301,3 @@ class ProcessorTesterMixin:
self.assertEqual(inputs["pixel_values"].shape[2], 214)
self.assertEqual(len(inputs["input_ids"][0]), 76)
class MyProcessor(ProcessorMixin):
    """Small ``ProcessorMixin`` subclass pairing an image processor with a tokenizer.

    Besides the two sub-processors it stores two extra constructor arguments,
    ``processor_attr_1`` and ``processor_attr_2``, as plain instance attributes.
    """

    # Names of the sub-processor attributes and the classes expected for each.
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, processor_attr_1=1, processor_attr_2=True):
        """Forward the sub-processors to the base class and keep the extra attributes."""
        super().__init__(image_processor, tokenizer)

        # Extra, processor-level settings kept directly on the instance.
        self.processor_attr_1 = processor_attr_1
        self.processor_attr_2 = processor_attr_2
@require_tokenizers
@require_vision
class ProcessorTest(unittest.TestCase):
    """Serialization tests for ``MyProcessor`` built from CLIP components."""

    processor_class = MyProcessor

    def prepare_processor_dict(self):
        """Return the extra processor attributes used to build and check the processor."""
        return {"processor_attr_1": 1, "processor_attr_2": False}

    def get_processor(self):
        """Create a ``MyProcessor`` from the pretrained CLIP image processor and fast tokenizer."""
        checkpoint = "openai/clip-vit-large-patch14"
        return MyProcessor(
            CLIPImageProcessor.from_pretrained(checkpoint),
            CLIPTokenizerFast.from_pretrained(checkpoint),
            **self.prepare_processor_dict(),
        )

    def test_processor_to_json_string(self):
        """The JSON dump and the live object must both expose the extra attributes."""
        processor = self.get_processor()
        serialized = json.loads(processor.to_json_string())

        for name, expected in self.prepare_processor_dict().items():
            self.assertEqual(serialized[name], expected)
            self.assertEqual(getattr(processor, name, None), expected)

    def test_processor_from_and_save_pretrained(self):
        """A processor saved to disk and reloaded must have an identical ``to_dict()``."""
        original = self.get_processor()

        with tempfile.TemporaryDirectory() as save_dir:
            # The first path returned by save_pretrained is the file that gets format-checked.
            saved_files = original.save_pretrained(save_dir)
            check_json_file_has_correct_format(saved_files[0])
            reloaded = self.processor_class.from_pretrained(save_dir)

        self.assertEqual(reloaded.to_dict(), original.to_dict())