Uniformize kwargs for image-text-to-text processors (#32544)

* uniformize FUYU processor kwargs

* Uniformize instructblip processor kwargs

* Fix processor kwargs and tests Fuyu, InstructBlip, Kosmos2

* Uniformize llava_next processor

* Fix save_load test for processor with chat_template only as extra init args

* Fix import Unpack

* Fix Fuyu Processor import

* Fix FuyuProcessor import

* Fix FuyuProcessor

* Add defaults for specific kwargs kosmos2

* Fix Udop to return BatchFeature instead of BatchEncoding and uniformize kwargs

* Add tests processor Udop

* remove Copied from in processing Udop as change of input orders caused by BatchEncoding -> BatchFeature

* Fix overwrite tests kwargs processors

* Add warnings and BC for changes in processor inputs order, change docs, add BC for text_pair as arg for Udop

* Fix processing test fuyu

* remove unnecessary pad_token check in instructblip ProcessorTest

* Fix BC tests and cleanup

* FIx imports fuyu

* Uniformize Pix2Struct

* Fix wrong name for FuyuProcessorKwargs

* Fix slow tests reversed inputs align fuyu llava-next, change udop warning

* Fix wrong logging import udop

* Add check images text input order

* Fix copies

* change text pair handling when positional arg

* rebase on main, fix imports in test_processing_common

* remove optional args and udop uniformization from this PR

* fix failing tests

* remove unnecessary test, fix processing utils and test processing common

* cleanup Unpack

* cleanup

* fix conflict grounding dino
This commit is contained in:
Yoni Gozlan
2024-09-24 21:28:19 -04:00
committed by GitHub
parent fa0bb0fe76
commit 5f0c181f4e
24 changed files with 763 additions and 852 deletions

View File

@@ -15,18 +15,15 @@ import shutil
import tempfile
import unittest
import numpy as np
import pytest
from transformers.testing_utils import require_torch, require_vision
from transformers.testing_utils import require_vision
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from PIL import Image
from transformers import (
AutoProcessor,
BertTokenizerFast,
@@ -65,16 +62,6 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def tearDown(self):
shutil.rmtree(self.tmpdirname)
# Ignore copy
def prepare_image_inputs(self):
"""This function prepares a list of list of PIL images"""
video_inputs = [
[Image.fromarray(np.random.randint(255, size=(30, 400, 3), dtype=np.uint8)) for _ in range(5)]
for _ in range(2)
]
return video_inputs
def test_save_load_pretrained_additional_features(self):
processor = InstructBlipVideoProcessor(
tokenizer=self.get_tokenizer(),
@@ -193,261 +180,3 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
list(inputs.keys()),
["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"],
)
# Override as InstructBlipVideoProcessor has qformer_tokenizer
@require_vision
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
qformer_tokenizer = self.get_component("qformer_tokenizer", max_length=117, padding="max_length")
processor = self.processor_class(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(len(inputs["input_ids"][0]), 117)
# Override as InstructBlipVideoProcessor has qformer_tokenizer
@require_torch
@require_vision
def test_image_processor_defaults_preserved_by_image_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor", size=(234, 234))
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
qformer_tokenizer = self.get_component("qformer_tokenizer", max_length=117, padding="max_length")
processor = self.processor_class(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input)
self.assertEqual(len(inputs["pixel_values"][0][0]), 234)
# Override as InstructBlipVideoProcessor has qformer_tokenizer
@require_vision
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer", padding="longest")
qformer_tokenizer = self.get_component("qformer_tokenizer", padding="longest")
processor = self.processor_class(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
inputs = processor(
text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
)
self.assertEqual(len(inputs["input_ids"][0]), 112)
# Override as InstructBlipVideoProcessor has qformer_tokenizer
@require_torch
@require_vision
def test_kwargs_overrides_default_image_processor_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor", size=(234, 234))
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
qformer_tokenizer = self.get_component("qformer_tokenizer", max_length=117, padding="max_length")
processor = self.processor_class(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, size=[224, 224])
self.assertEqual(len(inputs["pixel_values"][0][0]), 224)
# Override as InstructBlipVideoProcessor has qformer_tokenizer
@require_torch
@require_vision
def test_unstructured_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
qformer_tokenizer = self.get_component("qformer_tokenizer")
processor = self.processor_class(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
size={"height": 214, "width": 214},
padding="max_length",
max_length=76,
)
self.assertEqual(inputs["pixel_values"].shape[2], 214)
self.assertEqual(len(inputs["input_ids"][0]), 76)
# Override as InstructBlipVideoProcessor has qformer_tokenizer
@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
qformer_tokenizer = self.get_component("qformer_tokenizer")
processor = self.processor_class(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
self.skip_processor_without_typed_kwargs(processor)
input_str = ["lower newer", "upper older longer string"]
image_input = self.prepare_image_inputs() * 2
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
size={"height": 214, "width": 214},
padding="longest",
max_length=76,
)
self.assertEqual(inputs["pixel_values"].shape[2], 214)
self.assertEqual(len(inputs["input_ids"][0]), 6)
# Override as InstructBlipVideoProcessor has qformer_tokenizer
@require_torch
@require_vision
def test_doubly_passed_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
qformer_tokenizer = self.get_component("qformer_tokenizer")
processor = self.processor_class(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
self.skip_processor_without_typed_kwargs(processor)
input_str = ["lower newer"]
image_input = self.prepare_image_inputs()
with self.assertRaises(ValueError):
_ = processor(
text=input_str,
images=image_input,
images_kwargs={"size": {"height": 222, "width": 222}},
size={"height": 214, "width": 214},
)
# Override as InstructBlipVideoProcessor has qformer_tokenizer
@require_torch
@require_vision
def test_structured_kwargs_nested(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
qformer_tokenizer = self.get_component("qformer_tokenizer")
processor = self.processor_class(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"images_kwargs": {"size": {"height": 214, "width": 214}},
"text_kwargs": {"padding": "max_length", "max_length": 76},
}
inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.skip_processor_without_typed_kwargs(processor)
self.assertEqual(inputs["pixel_values"].shape[2], 214)
self.assertEqual(len(inputs["input_ids"][0]), 76)
# Override as InstructBlipVideoProcessor has qformer_tokenizer
@require_torch
@require_vision
def test_structured_kwargs_nested_from_dict(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
qformer_tokenizer = self.get_component("qformer_tokenizer")
processor = self.processor_class(
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"images_kwargs": {"size": {"height": 214, "width": 214}},
"text_kwargs": {"padding": "max_length", "max_length": 76},
}
inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.assertEqual(inputs["pixel_values"].shape[2], 214)
self.assertEqual(len(inputs["input_ids"][0]), 76)
def test_overlapping_text_kwargs_handling(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
processor_kwargs = {}
processor_kwargs["image_processor"] = self.get_component("image_processor")
processor_kwargs["tokenizer"] = tokenizer = self.get_component("tokenizer")
if not tokenizer.pad_token:
tokenizer.pad_token = "[TEST_PAD]"
if "video_processor" in self.processor_class.attributes:
processor_kwargs["video_processor"] = self.get_component("video_processor")
qformer_tokenizer = self.get_component("qformer_tokenizer")
processor = self.processor_class(**processor_kwargs, qformer_tokenizer=qformer_tokenizer)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
with self.assertRaises(ValueError):
_ = processor(
text=input_str,
images=image_input,
return_tensors="pt",
padding="max_length",
text_kwargs={"padding": "do_not_pad"},
)