Uniformize kwargs for image-text-to-text processors (#32544)

* uniformize FUYU processor kwargs

* Uniformize instructblip processor kwargs

* Fix processor kwargs and tests Fuyu, InstructBlip, Kosmos2

* Uniformize llava_next processor

* Fix save_load test for processor with chat_template only as extra init args

* Fix import Unpack

* Fix Fuyu Processor import

* Fix FuyuProcessor import

* Fix FuyuProcessor

* Add defaults for specific kwargs kosmos2

* Fix Udop to return BatchFeature instead of BatchEncoding and uniformize kwargs

* Add tests processor Udop

* remove Copied from in processing Udop as change of input orders caused by BatchEncoding -> BatchFeature

* Fix overwrite tests kwargs processors

* Add warnings and BC for changes in processor inputs order, change docs, add BC for text_pair as arg for Udop

* Fix processing test fuyu

* remove unnecessary pad_token check in instructblip ProcessorTest

* Fix BC tests and cleanup

* FIx imports fuyu

* Uniformize Pix2Struct

* Fix wrong name for FuyuProcessorKwargs

* Fix slow tests reversed inputs align fuyu llava-next, change udop warning

* Fix wrong logging import udop

* Add check images text input order

* Fix copies

* change text pair handling when positional arg

* rebase on main, fix imports in test_processing_common

* remove optional args and udop uniformization from this PR

* fix failing tests

* remove unnecessary test, fix processing utils and test processing common

* cleanup Unpack

* cleanup

* fix conflict grounding dino
This commit is contained in:
Yoni Gozlan
2024-09-24 21:28:19 -04:00
committed by GitHub
parent fa0bb0fe76
commit 5f0c181f4e
24 changed files with 763 additions and 852 deletions

View File

@@ -338,7 +338,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
load_in_4bit=True,
)
inputs = self.processor(self.prompt, self.image, return_tensors="pt")
inputs = self.processor(images=self.image, text=self.prompt, return_tensors="pt")
# verify inputs against original implementation
filepath = hf_hub_download(
@@ -390,8 +390,8 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
cats_image = Image.open(requests.get(url, stream=True).raw)
inputs = self.processor(
[self.prompt, self.prompt],
images=[self.image, cats_image],
text=[self.prompt, self.prompt],
return_tensors="pt",
padding=True,
).to(torch_device)
@@ -415,7 +415,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
)
prompt_with_unk = "[INST] <image>\nWhat is shown in this <unk> image? [/INST]"
inputs = self.processor(prompt_with_unk, self.image, return_tensors="pt")
inputs = self.processor(images=self.image, text=prompt_with_unk, return_tensors="pt")
# verify single forward pass
inputs = inputs.to(torch_device)
@@ -445,7 +445,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)
inputs = self.processor(
[self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True
).to(torch_device)
pixel_values = inputs["pixel_values"]
@@ -498,10 +498,10 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)
inputs_batched = self.processor(
[self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True
).to(torch_device)
inputs_single = self.processor(self.prompt, images=lowres_img, return_tensors="pt", padding=True).to(
inputs_single = self.processor(images=lowres_img, text=self.prompt, return_tensors="pt", padding=True).to(
torch_device
)
@@ -527,7 +527,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)
inputs_batched = self.processor(
[self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True
).to(torch_device)
# model is in eval mode by default so we should get pad on the left side
@@ -607,13 +607,13 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2356)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs.input_ids.shape[-1] == 17)
# generate exactly 20 tokens

View File

@@ -18,7 +18,9 @@ import unittest
import torch
from transformers import AutoProcessor, LlamaTokenizerFast, LlavaNextProcessor
from transformers.testing_utils import require_vision
from transformers.testing_utils import (
require_vision,
)
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin