Qwen2-VL: clean-up and add more tests (#33354)

* clean-up on qwen2-vl and add generation tests * add video tests * Update tests/models/qwen2_vl/test_processing_qwen2_vl.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * fix and add better tests * Update src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * update docs and address comments * Update docs/source/en/model_doc/qwen2_vl.md Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update docs/source/en/model_doc/qwen2_vl.md Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * update * remove size at all --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
2024-09-12 18:24:04 +02:00
parent 8f8af0fb38
commit 2f611d30d9
6 changed files with 297 additions and 106 deletions
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -23,15 +23,12 @@ try:
    from typing import Unpack
 except ImportError:
    from typing_extensions import Unpack
-import unittest

 import numpy as np

-from transformers import CLIPTokenizerFast, ProcessorMixin
 from transformers.models.auto.processing_auto import processor_class_from_name
 from transformers.testing_utils import (
    check_json_file_has_correct_format,
-    require_tokenizers,
    require_torch,
    require_vision,
 )
@@ -41,8 +38,6 @@ from transformers.utils import is_vision_available
 if is_vision_available():
    from PIL import Image

-    from transformers import CLIPImageProcessor
-

 def prepare_image_inputs():
    """This function prepares a list of PIL images"""
@@ -53,7 +48,6 @@ def prepare_image_inputs():

@require_torch
@require_vision
-@require_torch
 class ProcessorTesterMixin:
    processor_class = None

@@ -91,6 +85,13 @@ class ProcessorTesterMixin:
        """This function prepares a list of PIL images for testing"""
        return prepare_image_inputs()

+    @require_vision
+    def prepare_video_inputs(self):
+        """This function prepares a list of numpy videos."""
+        video_input = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] * 8
+        image_inputs = [video_input] * 3  # batch-size=3
+        return image_inputs
+
    def test_processor_to_json_string(self):
        processor = self.get_processor()
        obj = json.loads(processor.to_json_string())
@@ -125,8 +126,6 @@ class ProcessorTesterMixin:
        if not is_kwargs_typed_dict:
            self.skipTest(f"{self.processor_class} doesn't have typed kwargs.")

-    @require_vision
-    @require_torch
    def test_tokenizer_defaults_preserved_by_kwargs(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -141,8 +140,6 @@ class ProcessorTesterMixin:
        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
        self.assertEqual(len(inputs["input_ids"][0]), 117)

-    @require_torch
-    @require_vision
    def test_image_processor_defaults_preserved_by_image_kwargs(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -158,8 +155,6 @@ class ProcessorTesterMixin:
        inputs = processor(text=input_str, images=image_input)
        self.assertEqual(len(inputs["pixel_values"][0][0]), 234)

-    @require_vision
-    @require_torch
    def test_kwargs_overrides_default_tokenizer_kwargs(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -176,8 +171,6 @@ class ProcessorTesterMixin:
        )
        self.assertEqual(len(inputs["input_ids"][0]), 112)

-    @require_torch
-    @require_vision
    def test_kwargs_overrides_default_image_processor_kwargs(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -193,8 +186,6 @@ class ProcessorTesterMixin:
        inputs = processor(text=input_str, images=image_input, size=[224, 224])
        self.assertEqual(len(inputs["pixel_values"][0][0]), 224)

-    @require_torch
-    @require_vision
    def test_unstructured_kwargs(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -218,8 +209,6 @@ class ProcessorTesterMixin:
        self.assertEqual(inputs["pixel_values"].shape[2], 214)
        self.assertEqual(len(inputs["input_ids"][0]), 76)

-    @require_torch
-    @require_vision
    def test_unstructured_kwargs_batched(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -244,8 +233,6 @@ class ProcessorTesterMixin:

        self.assertEqual(len(inputs["input_ids"][0]), 6)

-    @require_torch
-    @require_vision
    def test_doubly_passed_kwargs(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -265,8 +252,6 @@ class ProcessorTesterMixin:
                size={"height": 214, "width": 214},
            )

-    @require_torch
-    @require_vision
    def test_structured_kwargs_nested(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -293,8 +278,6 @@ class ProcessorTesterMixin:

        self.assertEqual(len(inputs["input_ids"][0]), 76)

-    @require_torch
-    @require_vision
    def test_structured_kwargs_nested_from_dict(self):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
@@ -318,48 +301,3 @@ class ProcessorTesterMixin:
        self.assertEqual(inputs["pixel_values"].shape[2], 214)

        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-
-class MyProcessor(ProcessorMixin):
-    attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "CLIPImageProcessor"
-    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
-
-    def __init__(self, image_processor=None, tokenizer=None, processor_attr_1=1, processor_attr_2=True):
-        super().__init__(image_processor, tokenizer)
-
-        self.processor_attr_1 = processor_attr_1
-        self.processor_attr_2 = processor_attr_2
-
-
-@require_tokenizers
-@require_vision
-class ProcessorTest(unittest.TestCase):
-    processor_class = MyProcessor
-
-    def prepare_processor_dict(self):
-        return {"processor_attr_1": 1, "processor_attr_2": False}
-
-    def get_processor(self):
-        image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
-        tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
-        processor = MyProcessor(image_processor, tokenizer, **self.prepare_processor_dict())
-
-        return processor
-
-    def test_processor_to_json_string(self):
-        processor = self.get_processor()
-        obj = json.loads(processor.to_json_string())
-        for key, value in self.prepare_processor_dict().items():
-            self.assertEqual(obj[key], value)
-            self.assertEqual(getattr(processor, key, None), value)
-
-    def test_processor_from_and_save_pretrained(self):
-        processor_first = self.get_processor()
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            saved_file = processor_first.save_pretrained(tmpdirname)[0]
-            check_json_file_has_correct_format(saved_file)
-            processor_second = self.processor_class.from_pretrained(tmpdirname)
-
-        self.assertEqual(processor_second.to_dict(), processor_first.to_dict())