Uniformize kwargs for image-text-to-text processors (#32544)
* uniformize FUYU processor kwargs * Uniformize instructblip processor kwargs * Fix processor kwargs and tests Fuyu, InstructBlip, Kosmos2 * Uniformize llava_next processor * Fix save_load test for processor with chat_template only as extra init args * Fix import Unpack * Fix Fuyu Processor import * Fix FuyuProcessor import * Fix FuyuProcessor * Add defaults for specific kwargs kosmos2 * Fix Udop to return BatchFeature instead of BatchEncoding and uniformize kwargs * Add tests processor Udop * remove Copied from in processing Udop as change of input orders caused by BatchEncoding -> BatchFeature * Fix overwrite tests kwargs processors * Add warnings and BC for changes in processor inputs order, change docs, add BC for text_pair as arg for Udop * Fix processing test fuyu * remove unnecessary pad_token check in instructblip ProcessorTest * Fix BC tests and cleanup * FIx imports fuyu * Uniformize Pix2Struct * Fix wrong name for FuyuProcessorKwargs * Fix slow tests reversed inputs align fuyu llava-next, change udop warning * Fix wrong logging import udop * Add check images text input order * Fix copies * change text pair handling when positional arg * rebase on main, fix imports in test_processing_common * remove optional args and udop uniformization from this PR * fix failing tests * remove unnecessary test, fix processing utils and test processing common * cleanup Unpack * cleanup * fix conflict grounding dino
This commit is contained in:
@@ -338,7 +338,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
load_in_4bit=True,
|
||||
)
|
||||
|
||||
inputs = self.processor(self.prompt, self.image, return_tensors="pt")
|
||||
inputs = self.processor(images=self.image, text=self.prompt, return_tensors="pt")
|
||||
|
||||
# verify inputs against original implementation
|
||||
filepath = hf_hub_download(
|
||||
@@ -390,8 +390,8 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
cats_image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
inputs = self.processor(
|
||||
[self.prompt, self.prompt],
|
||||
images=[self.image, cats_image],
|
||||
text=[self.prompt, self.prompt],
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
).to(torch_device)
|
||||
@@ -415,7 +415,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
)
|
||||
|
||||
prompt_with_unk = "[INST] <image>\nWhat is shown in this <unk> image? [/INST]"
|
||||
inputs = self.processor(prompt_with_unk, self.image, return_tensors="pt")
|
||||
inputs = self.processor(images=self.image, text=prompt_with_unk, return_tensors="pt")
|
||||
|
||||
# verify single forward pass
|
||||
inputs = inputs.to(torch_device)
|
||||
@@ -445,7 +445,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)
|
||||
|
||||
inputs = self.processor(
|
||||
[self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
|
||||
images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True
|
||||
).to(torch_device)
|
||||
pixel_values = inputs["pixel_values"]
|
||||
|
||||
@@ -498,10 +498,10 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)
|
||||
|
||||
inputs_batched = self.processor(
|
||||
[self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
|
||||
images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True
|
||||
).to(torch_device)
|
||||
|
||||
inputs_single = self.processor(self.prompt, images=lowres_img, return_tensors="pt", padding=True).to(
|
||||
inputs_single = self.processor(images=lowres_img, text=self.prompt, return_tensors="pt", padding=True).to(
|
||||
torch_device
|
||||
)
|
||||
|
||||
@@ -527,7 +527,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)
|
||||
|
||||
inputs_batched = self.processor(
|
||||
[self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
|
||||
images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True
|
||||
).to(torch_device)
|
||||
|
||||
# model is in eval mode by default so we should get pad on the left side
|
||||
@@ -607,13 +607,13 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
# check processing with expansion of inputs
|
||||
processor.vision_feature_select_strategy = "default"
|
||||
processor.patch_size = 14
|
||||
inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
|
||||
inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
|
||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2356)
|
||||
|
||||
# check processing without expansion of inputs (legacy behavior)
|
||||
processor.vision_feature_select_strategy = None
|
||||
processor.patch_size = None
|
||||
inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
|
||||
inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
|
||||
self.assertTrue(inputs.input_ids.shape[-1] == 17)
|
||||
|
||||
# generate exactly 20 tokens
|
||||
|
||||
@@ -18,7 +18,9 @@ import unittest
|
||||
import torch
|
||||
|
||||
from transformers import AutoProcessor, LlamaTokenizerFast, LlavaNextProcessor
|
||||
from transformers.testing_utils import require_vision
|
||||
from transformers.testing_utils import (
|
||||
require_vision,
|
||||
)
|
||||
from transformers.utils import is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
Reference in New Issue
Block a user