Uniformize kwargs for image-text-to-text processors (#32544)

* uniformize FUYU processor kwargs

* Uniformize instructblip processor kwargs

* Fix processor kwargs and tests Fuyu, InstructBlip, Kosmos2

* Uniformize llava_next processor

* Fix save_load test for processor with chat_template only as extra init args

* Fix import Unpack

* Fix Fuyu Processor import

* Fix FuyuProcessor import

* Fix FuyuProcessor

* Add defaults for specific kwargs kosmos2

* Fix Udop to return BatchFeature instead of BatchEncoding and uniformize kwargs

* Add tests processor Udop

* remove Copied from in processing Udop as change of input orders caused by BatchEncoding -> BatchFeature

* Fix overwrite tests kwargs processors

* Add warnings and BC for changes in processor inputs order, change docs, add BC for text_pair as arg for Udop

* Fix processing test fuyu

* remove unnecessary pad_token check in instructblip ProcessorTest

* Fix BC tests and cleanup

* FIx imports fuyu

* Uniformize Pix2Struct

* Fix wrong name for FuyuProcessorKwargs

* Fix slow tests reversed inputs align fuyu llava-next, change udop warning

* Fix wrong logging import udop

* Add check images text input order

* Fix copies

* change text pair handling when positional arg

* rebase on main, fix imports in test_processing_common

* remove optional args and udop uniformization from this PR

* fix failing tests

* remove unnecessary test, fix processing utils and test processing common

* cleanup Unpack

* cleanup

* fix conflict grounding dino
This commit is contained in:
Yoni Gozlan
2024-09-24 21:28:19 -04:00
committed by GitHub
parent fa0bb0fe76
commit 5f0c181f4e
24 changed files with 763 additions and 852 deletions

View File

@@ -330,7 +330,7 @@ class FuyuModelIntegrationTest(unittest.TestCase):
text_prompt_coco_captioning = "Generate a coco-style caption.\n"
inputs = processor(text=text_prompt_coco_captioning, images=image, return_tensors="pt")
inputs = processor(images=image, text=text_prompt_coco_captioning, return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=10)
# take the last 8 tokens (in order to skip special \n\x04 characters) and decode them

View File

@@ -1,17 +1,25 @@
import io
import tempfile
import unittest
import requests
from transformers import AutoTokenizer, is_torch_available, is_vision_available
from transformers.testing_utils import require_torch, require_torch_gpu, slow
from transformers import (
AutoProcessor,
AutoTokenizer,
FuyuImageProcessor,
FuyuProcessor,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import require_torch, require_vision
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from PIL import Image
if is_vision_available() and is_torch_available():
from transformers import FuyuImageProcessor, FuyuProcessor
if is_torch_available():
import torch
@@ -20,21 +28,36 @@ if is_torch_available():
@require_torch
@require_torch_gpu
@slow
class FuyuProcessingTest(unittest.TestCase): # TODO Which mixins do we add here?
""" """
@require_vision
class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = FuyuProcessor
def setUp(self):
pretrained_model_name = "adept/fuyu-8b"
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
self.image_processor = FuyuImageProcessor()
self.tmpdirname = tempfile.mkdtemp()
image_processor = FuyuImageProcessor()
tokenizer = AutoTokenizer.from_pretrained("adept/fuyu-8b")
processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
processor.save_pretrained(self.tmpdirname)
self.processor = FuyuProcessor(image_processor=self.image_processor, tokenizer=self.tokenizer)
self.text_prompt = "Generate a coco-style caption.\\n"
bus_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
self.bus_image_pil = Image.open(io.BytesIO(requests.get(bus_image_url).content))
def get_processor(self):
image_processor = FuyuImageProcessor()
tokenizer = AutoTokenizer.from_pretrained("adept/fuyu-8b")
processor = FuyuProcessor(image_processor, tokenizer, **self.prepare_processor_dict())
return processor
def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def test_fuyu_processing(self):
"""
Test to ensure that the standard processing on a gold example matches adept's code.
@@ -43,7 +66,7 @@ class FuyuProcessingTest(unittest.TestCase): # TODO Which mixins do we add here
EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64)
EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64)
one_image_bus_model_inputs = self.processor(text=self.text_prompt, images=self.bus_image_pil)
one_image_bus_model_inputs = self.get_processor()(text=self.text_prompt, images=self.bus_image_pil)
# fmt: on
torch.testing.assert_close(one_image_bus_model_inputs["image_patches_indices"], EXPECTED_IMAGE_PATCH_INPUTS)
@@ -53,8 +76,8 @@ class FuyuProcessingTest(unittest.TestCase): # TODO Which mixins do we add here
"""
Test to check processor works with just text input
"""
processor_outputs = self.processor(text=self.text_prompt)
tokenizer_outputs = self.tokenizer(self.text_prompt)
processor_outputs = self.get_processor()(text=self.text_prompt)
tokenizer_outputs = self.get_tokenizer()(self.text_prompt)
self.assertEqual(processor_outputs["input_ids"], tokenizer_outputs["input_ids"])
def test_fuyu_processing_no_text(self):
@@ -90,7 +113,7 @@ class FuyuProcessingTest(unittest.TestCase): # TODO Which mixins do we add here
]).to(torch.int64)
# fmt: on
processor_outputs = self.processor(images=self.bus_image_pil)
processor_outputs = self.get_processor()(images=self.bus_image_pil)
self.assertTrue((processor_outputs["image_patches_indices"] == EXPECTED_IMAGE_PATCH_INPUTS).all())
def test_fuyu_processing_multiple_image_sample(self):
@@ -107,7 +130,7 @@ class FuyuProcessingTest(unittest.TestCase): # TODO Which mixins do we add here
# Batch of two images - equally sized
images = [self.bus_image_pil, self.bus_image_pil]
processor_outputs = self.processor(text=[self.text_prompt, self.text_prompt], images=images)
processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images)
self.assertTrue(
(
@@ -124,18 +147,18 @@ class FuyuProcessingTest(unittest.TestCase): # TODO Which mixins do we add here
# Processes single images with different sizes as expected
images = [self.bus_image_pil]
processor_outputs = self.processor(text=self.text_prompt, images=images)
processor_outputs = self.get_processor()(text=self.text_prompt, images=images)
self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_IMAGE_PATCH_INPUTS).all())
self.assertTrue((processor_outputs["input_ids"] == SINGLE_PADDED_UNPACKED_TOKEN_INPUTS).all())
images = [self.bus_image_pil.resize((64, 300))]
processor_outputs = self.processor(text=self.text_prompt, images=images)
processor_outputs = self.get_processor()(text=self.text_prompt, images=images)
self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_RESIZED_IMAGE_PATCH_INPUTS).all())
self.assertTrue((processor_outputs["input_ids"] == SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS).all())
# Batch of two images - different sizes. Left-pads the smaller image inputs
images = [self.bus_image_pil, self.bus_image_pil.resize((64, 300))]
processor_outputs = self.processor(text=[self.text_prompt, self.text_prompt], images=images)
processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images)
padding_len_patch = SINGLE_IMAGE_PATCH_INPUTS.shape[1] - SINGLE_RESIZED_IMAGE_PATCH_INPUTS.shape[1]
padded_single_resized_image_patch = torch.cat(
@@ -156,6 +179,155 @@ class FuyuProcessingTest(unittest.TestCase): # TODO Which mixins do we add here
self.assertTrue((processor_outputs["image_patches_indices"] == expected_image_patch_inputs).all())
self.assertTrue((processor_outputs["input_ids"] == expected_padded_unpacked_token_inputs).all())
# Rewrite as Fuyu supports tokenizer kwargs only when image is None.
@require_vision
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer", max_length=117)
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
# Fuyu uses tokenizer kwargs only when image is None.
image_input = None
inputs = processor(
text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
)
self.assertEqual(len(inputs["input_ids"][0]), 112)
@unittest.skip("Fuyu processor does not support image_processor kwargs")
def test_image_processor_defaults_preserved_by_image_kwargs(self):
pass
@unittest.skip("Fuyu processor does not support image_processor kwargs")
def test_kwargs_overrides_default_image_processor_kwargs(self):
pass
# Rewrite as Fuyu supports tokenizer kwargs only when image is None.
@require_vision
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
# Fuyu uses tokenizer kwargs only when image is None.
image_input = None
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(len(inputs["input_ids"][0]), 117)
# Rewrite as Fuyu image processor does not return pixel values
@require_torch
@require_vision
def test_structured_kwargs_nested(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
# Fuyu uses tokenizer kwargs only when image is None.
image_input = None
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"text_kwargs": {"padding": "max_length", "max_length": 76},
}
inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.skip_processor_without_typed_kwargs(processor)
self.assertEqual(len(inputs["input_ids"][0]), 76)
# Rewrite as Fuyu image processor does not return pixel values
@require_torch
@require_vision
def test_structured_kwargs_nested_from_dict(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
# Fuyu uses tokenizer kwargs only when image is None.
image_input = None
# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"text_kwargs": {"padding": "max_length", "max_length": 76},
}
inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.assertEqual(len(inputs["input_ids"][0]), 76)
# Rewrite as Fuyu supports tokenizer kwargs only when image is None.
@require_torch
@require_vision
def test_unstructured_kwargs(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = "lower newer"
# Fuyu uses tokenizer kwargs only when image is None.
image_input = None
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
padding="max_length",
max_length=76,
)
self.assertEqual(len(inputs["input_ids"][0]), 76)
# Rewrite as Fuyu supports tokenizer kwargs only when image is None.
@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = ["lower newer", "upper older longer string"]
# Fuyu uses tokenizer kwargs only when image is None.
image_input = None
inputs = processor(
text=input_str,
images=image_input,
return_tensors="pt",
padding="longest",
max_length=76,
)
self.assertEqual(len(inputs["input_ids"][0]), 6)
@require_torch
class TestImageTextProcessingUtils(unittest.TestCase):