Uniformize kwargs for image-text-to-text processors (#32544)

* uniformize FUYU processor kwargs
* Uniformize instructblip processor kwargs
* Fix processor kwargs and tests Fuyu, InstructBlip, Kosmos2
* Uniformize llava_next processor
* Fix save_load test for processor with chat_template only as extra init args
* Fix import Unpack
* Fix Fuyu Processor import
* Fix FuyuProcessor import
* Fix FuyuProcessor
* Add defaults for specific kwargs kosmos2
* Fix Udop to return BatchFeature instead of BatchEncoding and uniformize kwargs
* Add tests processor Udop
* remove Copied from in processing Udop as change of input orders caused by BatchEncoding -> BatchFeature
* Fix overwrite tests kwargs processors
* Add warnings and BC for changes in processor inputs order, change docs, add BC for text_pair as arg for Udop
* Fix processing test fuyu
* remove unnecessary pad_token check in instructblip ProcessorTest
* Fix BC tests and cleanup
* Fix imports fuyu
* Uniformize Pix2Struct
* Fix wrong name for FuyuProcessorKwargs
* Fix slow tests reversed inputs align fuyu llava-next, change udop warning
* Fix wrong logging import udop
* Add check images text input order
* Fix copies
* change text pair handling when positional arg
* rebase on main, fix imports in test_processing_common
* remove optional args and udop uniformization from this PR
* fix failing tests
* remove unnecessary test, fix processing utils and test processing common
* cleanup Unpack
* cleanup
* fix conflict grounding dino
2024-09-24 21:28:19 -04:00
parent fa0bb0fe76
commit 5f0c181f4e
24 changed files with 763 additions and 852 deletions
--- a/tests/models/llava_next/test_modeling_llava_next.py
+++ b/tests/models/llava_next/test_modeling_llava_next.py
@@ -338,7 +338,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
            load_in_4bit=True,
        )

-        inputs = self.processor(self.prompt, self.image, return_tensors="pt")
+        inputs = self.processor(images=self.image, text=self.prompt, return_tensors="pt")

        # verify inputs against original implementation
        filepath = hf_hub_download(
@@ -390,8 +390,8 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
        cats_image = Image.open(requests.get(url, stream=True).raw)

        inputs = self.processor(
-            [self.prompt, self.prompt],
            images=[self.image, cats_image],
+            text=[self.prompt, self.prompt],
            return_tensors="pt",
            padding=True,
        ).to(torch_device)
@@ -415,7 +415,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
        )

        prompt_with_unk = "[INST] <image>\nWhat is shown in this <unk> image? [/INST]"
-        inputs = self.processor(prompt_with_unk, self.image, return_tensors="pt")
+        inputs = self.processor(images=self.image, text=prompt_with_unk, return_tensors="pt")

        # verify single forward pass
        inputs = inputs.to(torch_device)
@@ -445,7 +445,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
        lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)

        inputs = self.processor(
-            [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
+            images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True
        ).to(torch_device)
        pixel_values = inputs["pixel_values"]

@@ -498,10 +498,10 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
        lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)

        inputs_batched = self.processor(
-            [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
+            images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True
        ).to(torch_device)

-        inputs_single = self.processor(self.prompt, images=lowres_img, return_tensors="pt", padding=True).to(
+        inputs_single = self.processor(images=lowres_img, text=self.prompt, return_tensors="pt", padding=True).to(
            torch_device
        )

@@ -527,7 +527,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
        lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)

        inputs_batched = self.processor(
-            [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
+            images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True
        ).to(torch_device)

        # model is in eval mode by default so we should get pad on the left side
@@ -607,13 +607,13 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
        # check processing with expansion of inputs
        processor.vision_feature_select_strategy = "default"
        processor.patch_size = 14
-        inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+        inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
        self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2356)

        # check processing without expansion of inputs (legacy behavior)
        processor.vision_feature_select_strategy = None
        processor.patch_size = None
-        inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+        inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
        self.assertTrue(inputs.input_ids.shape[-1] == 17)

        # generate exactly 20 tokens
--- a/tests/models/llava_next/test_processor_llava_next.py
+++ b/tests/models/llava_next/test_processor_llava_next.py
@@ -18,7 +18,9 @@ import unittest
 import torch

 from transformers import AutoProcessor, LlamaTokenizerFast, LlavaNextProcessor
-from transformers.testing_utils import require_vision
+from transformers.testing_utils import (
+    require_vision,
+)
 from transformers.utils import is_vision_available

 from ...test_processing_common import ProcessorTesterMixin