Uniformize kwargs for image-text-to-text processors (#32544)

* uniformize FUYU processor kwargs * Uniformize instructblip processor kwargs * Fix processor kwargs and tests Fuyu, InstructBlip, Kosmos2 * Uniformize llava_next processor * Fix save_load test for processor with chat_template only as extra init args * Fix import Unpack * Fix Fuyu Processor import * Fix FuyuProcessor import * Fix FuyuProcessor * Add defaults for specific kwargs kosmos2 * Fix Udop to return BatchFeature instead of BatchEncoding and uniformize kwargs * Add tests processor Udop * remove Copied from in processing Udop as change of input orders caused by BatchEncoding -> BatchFeature * Fix overwrite tests kwargs processors * Add warnings and BC for changes in processor inputs order, change docs, add BC for text_pair as arg for Udop * Fix processing test fuyu * remove unnecessary pad_token check in instructblip ProcessorTest * Fix BC tests and cleanup * FIx imports fuyu * Uniformize Pix2Struct * Fix wrong name for FuyuProcessorKwargs * Fix slow tests reversed inputs align fuyu llava-next, change udop warning * Fix wrong logging import udop * Add check images text input order * Fix copies * change text pair handling when positional arg * rebase on main, fix imports in test_processing_common * remove optional args and udop uniformization from this PR * fix failing tests * remove unnecessary test, fix processing utils and test processing common * cleanup Unpack * cleanup * fix conflict grounding dino
2024-09-24 21:28:19 -04:00
parent fa0bb0fe76
commit 5f0c181f4e
24 changed files with 763 additions and 852 deletions
--- a/tests/models/align/test_modeling_align.py
+++ b/tests/models/align/test_modeling_align.py
@@ -626,7 +626,7 @@ class AlignModelIntegrationTest(unittest.TestCase):

        image = prepare_img()
        texts = ["a photo of a cat", "a photo of a dog"]
-        inputs = processor(text=texts, images=image, return_tensors="pt").to(torch_device)
+        inputs = processor(images=image, text=texts, return_tensors="pt").to(torch_device)

        # forward pass
        with torch.no_grad():
--- a/tests/models/fuyu/test_modeling_fuyu.py
+++ b/tests/models/fuyu/test_modeling_fuyu.py
@@ -330,7 +330,7 @@ class FuyuModelIntegrationTest(unittest.TestCase):

        text_prompt_coco_captioning = "Generate a coco-style caption.\n"

-        inputs = processor(text=text_prompt_coco_captioning, images=image, return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt_coco_captioning, return_tensors="pt")
        generated_ids = model.generate(**inputs, max_new_tokens=10)

        # take the last 8 tokens (in order to skip special \n\x04 characters) and decode them
--- a/tests/models/fuyu/test_processing_fuyu.py
+++ b/tests/models/fuyu/test_processing_fuyu.py
@@ -1,17 +1,25 @@
 import io
+import tempfile
 import unittest

 import requests

-from transformers import AutoTokenizer, is_torch_available, is_vision_available
-from transformers.testing_utils import require_torch, require_torch_gpu, slow
+from transformers import (
+    AutoProcessor,
+    AutoTokenizer,
+    FuyuImageProcessor,
+    FuyuProcessor,
+    is_torch_available,
+    is_vision_available,
+)
+from transformers.testing_utils import require_torch, require_vision
+
+from ...test_processing_common import ProcessorTesterMixin


 if is_vision_available():
    from PIL import Image

-if is_vision_available() and is_torch_available():
-    from transformers import FuyuImageProcessor, FuyuProcessor

 if is_torch_available():
    import torch
@@ -20,21 +28,36 @@ if is_torch_available():


@require_torch
-@require_torch_gpu
-@slow
-class FuyuProcessingTest(unittest.TestCase):  # TODO Which mixins do we add here?
-    """ """
+@require_vision
+class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = FuyuProcessor

    def setUp(self):
-        pretrained_model_name = "adept/fuyu-8b"
-        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
-        self.image_processor = FuyuImageProcessor()
+        self.tmpdirname = tempfile.mkdtemp()
+
+        image_processor = FuyuImageProcessor()
+        tokenizer = AutoTokenizer.from_pretrained("adept/fuyu-8b")
+
+        processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
+        processor.save_pretrained(self.tmpdirname)

-        self.processor = FuyuProcessor(image_processor=self.image_processor, tokenizer=self.tokenizer)
        self.text_prompt = "Generate a coco-style caption.\\n"
        bus_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        self.bus_image_pil = Image.open(io.BytesIO(requests.get(bus_image_url).content))

+    def get_processor(self):
+        image_processor = FuyuImageProcessor()
+        tokenizer = AutoTokenizer.from_pretrained("adept/fuyu-8b")
+        processor = FuyuProcessor(image_processor, tokenizer, **self.prepare_processor_dict())
+
+        return processor
+
+    def get_tokenizer(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
+
+    def get_image_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
+
    def test_fuyu_processing(self):
        """
        Test to ensure that the standard processing on a gold example matches adept's code.
@@ -43,7 +66,7 @@ class FuyuProcessingTest(unittest.TestCase):  # TODO Which mixins do we add here
        EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64)
        EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64)

-        one_image_bus_model_inputs = self.processor(text=self.text_prompt, images=self.bus_image_pil)
+        one_image_bus_model_inputs = self.get_processor()(text=self.text_prompt, images=self.bus_image_pil)

        # fmt: on
        torch.testing.assert_close(one_image_bus_model_inputs["image_patches_indices"], EXPECTED_IMAGE_PATCH_INPUTS)
@@ -53,8 +76,8 @@ class FuyuProcessingTest(unittest.TestCase):  # TODO Which mixins do we add here
        """
        Test to check processor works with just text input
        """
-        processor_outputs = self.processor(text=self.text_prompt)
-        tokenizer_outputs = self.tokenizer(self.text_prompt)
+        processor_outputs = self.get_processor()(text=self.text_prompt)
+        tokenizer_outputs = self.get_tokenizer()(self.text_prompt)
        self.assertEqual(processor_outputs["input_ids"], tokenizer_outputs["input_ids"])

    def test_fuyu_processing_no_text(self):
@@ -90,7 +113,7 @@ class FuyuProcessingTest(unittest.TestCase):  # TODO Which mixins do we add here
        ]).to(torch.int64)
        # fmt: on

-        processor_outputs = self.processor(images=self.bus_image_pil)
+        processor_outputs = self.get_processor()(images=self.bus_image_pil)
        self.assertTrue((processor_outputs["image_patches_indices"] == EXPECTED_IMAGE_PATCH_INPUTS).all())

    def test_fuyu_processing_multiple_image_sample(self):
@@ -107,7 +130,7 @@ class FuyuProcessingTest(unittest.TestCase):  # TODO Which mixins do we add here

        # Batch of two images - equally sized
        images = [self.bus_image_pil, self.bus_image_pil]
-        processor_outputs = self.processor(text=[self.text_prompt, self.text_prompt], images=images)
+        processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images)

        self.assertTrue(
            (
@@ -124,18 +147,18 @@ class FuyuProcessingTest(unittest.TestCase):  # TODO Which mixins do we add here

        # Processes single images with different sizes as expected
        images = [self.bus_image_pil]
-        processor_outputs = self.processor(text=self.text_prompt, images=images)
+        processor_outputs = self.get_processor()(text=self.text_prompt, images=images)
        self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_IMAGE_PATCH_INPUTS).all())
        self.assertTrue((processor_outputs["input_ids"] == SINGLE_PADDED_UNPACKED_TOKEN_INPUTS).all())

        images = [self.bus_image_pil.resize((64, 300))]
-        processor_outputs = self.processor(text=self.text_prompt, images=images)
+        processor_outputs = self.get_processor()(text=self.text_prompt, images=images)
        self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_RESIZED_IMAGE_PATCH_INPUTS).all())
        self.assertTrue((processor_outputs["input_ids"] == SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS).all())

        # Batch of two images - different sizes. Left-pads the smaller image inputs
        images = [self.bus_image_pil, self.bus_image_pil.resize((64, 300))]
-        processor_outputs = self.processor(text=[self.text_prompt, self.text_prompt], images=images)
+        processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images)

        padding_len_patch = SINGLE_IMAGE_PATCH_INPUTS.shape[1] - SINGLE_RESIZED_IMAGE_PATCH_INPUTS.shape[1]
        padded_single_resized_image_patch = torch.cat(
@@ -156,6 +179,155 @@ class FuyuProcessingTest(unittest.TestCase):  # TODO Which mixins do we add here
        self.assertTrue((processor_outputs["image_patches_indices"] == expected_image_patch_inputs).all())
        self.assertTrue((processor_outputs["input_ids"] == expected_padded_unpacked_token_inputs).all())

+    # Rewrite as Fuyu supports tokenizer kwargs only when image is None.
+    @require_vision
+    @require_torch
+    def test_kwargs_overrides_default_tokenizer_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=117)
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        # Fuyu uses tokenizer kwargs only when image is None.
+        image_input = None
+
+        inputs = processor(
+            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
+        )
+        self.assertEqual(len(inputs["input_ids"][0]), 112)
+
+    @unittest.skip("Fuyu processor does not support image_processor kwargs")
+    def test_image_processor_defaults_preserved_by_image_kwargs(self):
+        pass
+
+    @unittest.skip("Fuyu processor does not support image_processor kwargs")
+    def test_kwargs_overrides_default_image_processor_kwargs(self):
+        pass
+
+    # Rewrite as Fuyu supports tokenizer kwargs only when image is None.
+    @require_vision
+    @require_torch
+    def test_tokenizer_defaults_preserved_by_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        # Fuyu uses tokenizer kwargs only when image is None.
+        image_input = None
+
+        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
+        self.assertEqual(len(inputs["input_ids"][0]), 117)
+
+    # Rewrite as Fuyu image processor does not return pixel values
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        # Fuyu uses tokenizer kwargs only when image is None.
+        image_input = None
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    # Rewrite as Fuyu image processor does not return pixel values
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested_from_dict(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        # Fuyu uses tokenizer kwargs only when image is None.
+        image_input = None
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    # Rewrite as Fuyu supports tokenizer kwargs only when image is None.
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        # Fuyu uses tokenizer kwargs only when image is None.
+        image_input = None
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            padding="max_length",
+            max_length=76,
+        )
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    # Rewrite as Fuyu supports tokenizer kwargs only when image is None.
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer", "upper older longer string"]
+        # Fuyu uses tokenizer kwargs only when image is None.
+        image_input = None
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            padding="longest",
+            max_length=76,
+        )
+
+        self.assertEqual(len(inputs["input_ids"][0]), 6)
+

@require_torch
 class TestImageTextProcessingUtils(unittest.TestCase):
--- a/tests/models/instructblip/test_processor_instructblip.py
+++ b/tests/models/instructblip/test_processor_instructblip.py
@@ -17,7 +17,7 @@ import unittest

 import pytest

-from transformers.testing_utils import require_torch, require_vision
+from transformers.testing_utils import require_vision
 from transformers.utils import is_vision_available

 from ...test_processing_common import ProcessorTesterMixin
@@ -179,261 +179,3 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            list(inputs.keys()),
            ["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"],
        )
-
-    # Override as InstructBlipProcessor has qformer_tokenizer
-    @require_vision
-    @require_torch
-    def test_tokenizer_defaults_preserved_by_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
-        qformer_tokenizer = self.get_component("qformer_tokenizer", max_length=117, padding="max_length")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
-        self.assertEqual(len(inputs["input_ids"][0]), 117)
-
-    # Override as InstructBlipProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_image_processor_defaults_preserved_by_image_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor", size=(234, 234))
-        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
-        qformer_tokenizer = self.get_component("qformer_tokenizer", max_length=117, padding="max_length")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input)
-        self.assertEqual(len(inputs["pixel_values"][0][0]), 234)
-
-    # Override as InstructBlipProcessor has qformer_tokenizer
-    @require_vision
-    @require_torch
-    def test_kwargs_overrides_default_tokenizer_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer", padding="longest")
-        qformer_tokenizer = self.get_component("qformer_tokenizer", padding="longest")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(
-            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
-        )
-        self.assertEqual(len(inputs["input_ids"][0]), 112)
-
-    # Override as InstructBlipProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_kwargs_overrides_default_image_processor_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor", size=(234, 234))
-        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
-        qformer_tokenizer = self.get_component("qformer_tokenizer", max_length=117, padding="max_length")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input, size=[224, 224])
-        self.assertEqual(len(inputs["pixel_values"][0][0]), 224)
-
-    # Override as InstructBlipProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_unstructured_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-        qformer_tokenizer = self.get_component("qformer_tokenizer")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-        inputs = processor(
-            text=input_str,
-            images=image_input,
-            return_tensors="pt",
-            size={"height": 214, "width": 214},
-            padding="max_length",
-            max_length=76,
-        )
-
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-    # Override as InstructBlipProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_unstructured_kwargs_batched(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-        qformer_tokenizer = self.get_component("qformer_tokenizer")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = ["lower newer", "upper older longer string"]
-        image_input = self.prepare_image_inputs() * 2
-        inputs = processor(
-            text=input_str,
-            images=image_input,
-            return_tensors="pt",
-            size={"height": 214, "width": 214},
-            padding="longest",
-            max_length=76,
-        )
-
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-
-        self.assertEqual(len(inputs["input_ids"][0]), 6)
-
-    # Override as InstructBlipProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_doubly_passed_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-        qformer_tokenizer = self.get_component("qformer_tokenizer")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = ["lower newer"]
-        image_input = self.prepare_image_inputs()
-        with self.assertRaises(ValueError):
-            _ = processor(
-                text=input_str,
-                images=image_input,
-                images_kwargs={"size": {"height": 222, "width": 222}},
-                size={"height": 214, "width": 214},
-            )
-
-    # Override as InstructBlipProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_structured_kwargs_nested(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-        qformer_tokenizer = self.get_component("qformer_tokenizer")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        # Define the kwargs for each modality
-        all_kwargs = {
-            "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"size": {"height": 214, "width": 214}},
-            "text_kwargs": {"padding": "max_length", "max_length": 76},
-        }
-
-        inputs = processor(text=input_str, images=image_input, **all_kwargs)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-    # Override as InstructBlipProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_structured_kwargs_nested_from_dict(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-        qformer_tokenizer = self.get_component("qformer_tokenizer")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        # Define the kwargs for each modality
-        all_kwargs = {
-            "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"size": {"height": 214, "width": 214}},
-            "text_kwargs": {"padding": "max_length", "max_length": 76},
-        }
-
-        inputs = processor(text=input_str, images=image_input, **all_kwargs)
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-    def test_overlapping_text_kwargs_handling(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        processor_kwargs = {}
-        processor_kwargs["image_processor"] = self.get_component("image_processor")
-        processor_kwargs["tokenizer"] = tokenizer = self.get_component("tokenizer")
-        if not tokenizer.pad_token:
-            tokenizer.pad_token = "[TEST_PAD]"
-        if "video_processor" in self.processor_class.attributes:
-            processor_kwargs["video_processor"] = self.get_component("video_processor")
-
-        qformer_tokenizer = self.get_component("qformer_tokenizer")
-
-        processor = self.processor_class(**processor_kwargs, qformer_tokenizer=qformer_tokenizer)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        with self.assertRaises(ValueError):
-            _ = processor(
-                text=input_str,
-                images=image_input,
-                return_tensors="pt",
-                padding="max_length",
-                text_kwargs={"padding": "do_not_pad"},
-            )
--- a/tests/models/instructblipvideo/test_processor_instructblipvideo.py
+++ b/tests/models/instructblipvideo/test_processor_instructblipvideo.py
@@ -15,18 +15,15 @@ import shutil
 import tempfile
 import unittest

-import numpy as np
 import pytest

-from transformers.testing_utils import require_torch, require_vision
+from transformers.testing_utils import require_vision
 from transformers.utils import is_vision_available

 from ...test_processing_common import ProcessorTesterMixin


 if is_vision_available():
-    from PIL import Image
-
    from transformers import (
        AutoProcessor,
        BertTokenizerFast,
@@ -65,16 +62,6 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

-    # Ignore copy
-    def prepare_image_inputs(self):
-        """This function prepares a list of list of PIL images"""
-
-        video_inputs = [
-            [Image.fromarray(np.random.randint(255, size=(30, 400, 3), dtype=np.uint8)) for _ in range(5)]
-            for _ in range(2)
-        ]
-        return video_inputs
-
    def test_save_load_pretrained_additional_features(self):
        processor = InstructBlipVideoProcessor(
            tokenizer=self.get_tokenizer(),
@@ -193,261 +180,3 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            list(inputs.keys()),
            ["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"],
        )
-
-    # Override as InstructBlipVideoProcessor has qformer_tokenizer
-    @require_vision
-    @require_torch
-    def test_tokenizer_defaults_preserved_by_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
-        qformer_tokenizer = self.get_component("qformer_tokenizer", max_length=117, padding="max_length")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
-        self.assertEqual(len(inputs["input_ids"][0]), 117)
-
-    # Override as InstructBlipVideoProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_image_processor_defaults_preserved_by_image_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor", size=(234, 234))
-        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
-        qformer_tokenizer = self.get_component("qformer_tokenizer", max_length=117, padding="max_length")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input)
-        self.assertEqual(len(inputs["pixel_values"][0][0]), 234)
-
-    # Override as InstructBlipVideoProcessor has qformer_tokenizer
-    @require_vision
-    @require_torch
-    def test_kwargs_overrides_default_tokenizer_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer", padding="longest")
-        qformer_tokenizer = self.get_component("qformer_tokenizer", padding="longest")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(
-            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
-        )
-        self.assertEqual(len(inputs["input_ids"][0]), 112)
-
-    # Override as InstructBlipVideoProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_kwargs_overrides_default_image_processor_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor", size=(234, 234))
-        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
-        qformer_tokenizer = self.get_component("qformer_tokenizer", max_length=117, padding="max_length")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input, size=[224, 224])
-        self.assertEqual(len(inputs["pixel_values"][0][0]), 224)
-
-    # Override as InstructBlipVideoProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_unstructured_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-        qformer_tokenizer = self.get_component("qformer_tokenizer")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-        inputs = processor(
-            text=input_str,
-            images=image_input,
-            return_tensors="pt",
-            size={"height": 214, "width": 214},
-            padding="max_length",
-            max_length=76,
-        )
-
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-    # Override as InstructBlipVideoProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_unstructured_kwargs_batched(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-        qformer_tokenizer = self.get_component("qformer_tokenizer")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = ["lower newer", "upper older longer string"]
-        image_input = self.prepare_image_inputs() * 2
-        inputs = processor(
-            text=input_str,
-            images=image_input,
-            return_tensors="pt",
-            size={"height": 214, "width": 214},
-            padding="longest",
-            max_length=76,
-        )
-
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-
-        self.assertEqual(len(inputs["input_ids"][0]), 6)
-
-    # Override as InstructBlipVideoProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_doubly_passed_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-        qformer_tokenizer = self.get_component("qformer_tokenizer")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = ["lower newer"]
-        image_input = self.prepare_image_inputs()
-        with self.assertRaises(ValueError):
-            _ = processor(
-                text=input_str,
-                images=image_input,
-                images_kwargs={"size": {"height": 222, "width": 222}},
-                size={"height": 214, "width": 214},
-            )
-
-    # Override as InstructBlipVideoProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_structured_kwargs_nested(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-        qformer_tokenizer = self.get_component("qformer_tokenizer")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        # Define the kwargs for each modality
-        all_kwargs = {
-            "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"size": {"height": 214, "width": 214}},
-            "text_kwargs": {"padding": "max_length", "max_length": 76},
-        }
-
-        inputs = processor(text=input_str, images=image_input, **all_kwargs)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-    # Override as InstructBlipVideoProcessor has qformer_tokenizer
-    @require_torch
-    @require_vision
-    def test_structured_kwargs_nested_from_dict(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-        qformer_tokenizer = self.get_component("qformer_tokenizer")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        # Define the kwargs for each modality
-        all_kwargs = {
-            "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"size": {"height": 214, "width": 214}},
-            "text_kwargs": {"padding": "max_length", "max_length": 76},
-        }
-
-        inputs = processor(text=input_str, images=image_input, **all_kwargs)
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-    def test_overlapping_text_kwargs_handling(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        processor_kwargs = {}
-        processor_kwargs["image_processor"] = self.get_component("image_processor")
-        processor_kwargs["tokenizer"] = tokenizer = self.get_component("tokenizer")
-        if not tokenizer.pad_token:
-            tokenizer.pad_token = "[TEST_PAD]"
-        if "video_processor" in self.processor_class.attributes:
-            processor_kwargs["video_processor"] = self.get_component("video_processor")
-
-        qformer_tokenizer = self.get_component("qformer_tokenizer")
-
-        processor = self.processor_class(**processor_kwargs, qformer_tokenizer=qformer_tokenizer)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        with self.assertRaises(ValueError):
-            _ = processor(
-                text=input_str,
-                images=image_input,
-                return_tensors="pt",
-                padding="max_length",
-                text_kwargs={"padding": "do_not_pad"},
-            )
--- a/tests/models/kosmos2/test_processor_kosmos2.py
+++ b/tests/models/kosmos2/test_processor_kosmos2.py
@@ -61,7 +61,7 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()

-        image_processor = CLIPImageProcessor()
+        image_processor = CLIPImageProcessor(do_center_crop=False)

        # We have a SentencePiece fixture for testing
        slow_tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB)
@@ -487,3 +487,147 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertListEqual(outputs.input_ids.numpy().tolist()[-1], EXPECTED_IDS_BATCH[-1])
        self.assertListEqual(outputs.attention_mask.numpy().tolist()[-1], EXPECTED_MASK_BATCH[-1])
        self.assertListEqual(outputs.image_embeds_position_mask.numpy().tolist()[-1], EXPECTED_IMG_POS_MASK_BATCH[-1])
+
+    # Rewrite as Kosmos-2 supports custom padding only when image is None.
+    @require_vision
+    @require_torch
+    def test_kwargs_overrides_default_tokenizer_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=117)
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        # set image input to None
+        image_input = None
+
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            max_length=112,
+            padding="max_length",
+        )
+
+        self.assertEqual(len(inputs["input_ids"][0]), 112)
+
+    # Rewrite to test only image_processor kwargs
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"size": {"height": 214, "width": 214}},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+    # Rewrite to test only image_processor kwargs
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested_from_dict(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"size": {"height": 214, "width": 214}},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+    # Rewrite as Kosmos-2 supports custom padding only when image is None.
+    @require_vision
+    @require_torch
+    def test_tokenizer_defaults_preserved_by_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        # set image input to None
+        image_input = None
+
+        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
+        self.assertEqual(len(inputs["input_ids"][0]), 117)
+
+    # Rewrite as Kosmos-2 supports custom padding only when image is None.
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        # set image input to None
+        image_input = None
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            padding="max_length",
+            max_length=76,
+        )
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    # Rewrite as Kosmos-2 supports custom padding only when image is None.
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer", "upper older longer string"]
+        # set image input to None
+        image_input = None
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            size={"height": 214, "width": 214},
+            padding="longest",
+            max_length=76,
+        )
+
+        self.assertEqual(len(inputs["input_ids"][0]), 10)
--- a/tests/models/llava_next/test_modeling_llava_next.py
+++ b/tests/models/llava_next/test_modeling_llava_next.py
@@ -338,7 +338,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
            load_in_4bit=True,
        )

-        inputs = self.processor(self.prompt, self.image, return_tensors="pt")
+        inputs = self.processor(images=self.image, text=self.prompt, return_tensors="pt")

        # verify inputs against original implementation
        filepath = hf_hub_download(
@@ -390,8 +390,8 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
        cats_image = Image.open(requests.get(url, stream=True).raw)

        inputs = self.processor(
-            [self.prompt, self.prompt],
            images=[self.image, cats_image],
+            text=[self.prompt, self.prompt],
            return_tensors="pt",
            padding=True,
        ).to(torch_device)
@@ -415,7 +415,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
        )

        prompt_with_unk = "[INST] <image>\nWhat is shown in this <unk> image? [/INST]"
-        inputs = self.processor(prompt_with_unk, self.image, return_tensors="pt")
+        inputs = self.processor(images=self.image, text=prompt_with_unk, return_tensors="pt")

        # verify single forward pass
        inputs = inputs.to(torch_device)
@@ -445,7 +445,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
        lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)

        inputs = self.processor(
-            [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
+            images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True
        ).to(torch_device)
        pixel_values = inputs["pixel_values"]

@@ -498,10 +498,10 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
        lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)

        inputs_batched = self.processor(
-            [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
+            images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True
        ).to(torch_device)

-        inputs_single = self.processor(self.prompt, images=lowres_img, return_tensors="pt", padding=True).to(
+        inputs_single = self.processor(images=lowres_img, text=self.prompt, return_tensors="pt", padding=True).to(
            torch_device
        )

@@ -527,7 +527,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
        lowres_img = Image.open(requests.get(lowres_url, stream=True).raw)

        inputs_batched = self.processor(
-            [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True
+            images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True
        ).to(torch_device)

        # model is in eval mode by default so we should get pad on the left side
@@ -607,13 +607,13 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
        # check processing with expansion of inputs
        processor.vision_feature_select_strategy = "default"
        processor.patch_size = 14
-        inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+        inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
        self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2356)

        # check processing without expansion of inputs (legacy behavior)
        processor.vision_feature_select_strategy = None
        processor.patch_size = None
-        inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+        inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
        self.assertTrue(inputs.input_ids.shape[-1] == 17)

        # generate exactly 20 tokens
--- a/tests/models/llava_next/test_processor_llava_next.py
+++ b/tests/models/llava_next/test_processor_llava_next.py
@@ -18,7 +18,9 @@ import unittest
 import torch

 from transformers import AutoProcessor, LlamaTokenizerFast, LlavaNextProcessor
-from transformers.testing_utils import require_vision
+from transformers.testing_utils import (
+    require_vision,
+)
 from transformers.utils import is_vision_available

 from ...test_processing_common import ProcessorTesterMixin
--- a/tests/models/pix2struct/test_processor_pix2struct.py
+++ b/tests/models/pix2struct/test_processor_pix2struct.py
@@ -37,6 +37,8 @@ if is_vision_available():
@require_torch
 class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    processor_class = Pix2StructProcessor
+    text_input_name = "decoder_input_ids"
+    images_input_name = "flattened_patches"

    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()
@@ -180,3 +182,148 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):

        # For now the processor supports only ["flattened_patches", "input_ids", "attention_mask", "decoder_attention_mask"]
        self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask"])
+
+    @require_torch
+    @require_vision
+    def test_image_processor_defaults_preserved_by_image_kwargs(self):
+        # Rewrite as pix2struct processor return "flattened_patches" and not "pixel_values"
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", max_patches=1024, patch_size={"height": 8, "width": 8})
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+        self.assertEqual(len(inputs["flattened_patches"][0][0]), 194)
+
+    @require_torch
+    @require_vision
+    def test_kwargs_overrides_default_image_processor_kwargs(self):
+        # Rewrite as pix2struct processor return "flattened_patches" and not "pixel_values"
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", max_patches=4096)
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input, max_patches=1024)
+        self.assertEqual(len(inputs["flattened_patches"][0]), 1024)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs(self):
+        # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids"
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            max_patches=1024,
+            padding="max_length",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
+        self.assertEqual(len(inputs["decoder_input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids"
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer", "upper older longer string"]
+        image_input = self.prepare_image_inputs() * 2
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            max_patches=1024,
+            padding="longest",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
+
+        self.assertEqual(len(inputs["decoder_input_ids"][0]), 5)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested(self):
+        # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids"
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"max_patches": 1024},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
+
+        self.assertEqual(len(inputs["decoder_input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested_from_dict(self):
+        # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids"
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"max_patches": 1024},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
+
+        self.assertEqual(len(inputs["decoder_input_ids"][0]), 76)
--- a/tests/models/pixtral/test_processor_pixtral.py
+++ b/tests/models/pixtral/test_processor_pixtral.py
@@ -18,9 +18,7 @@ import unittest
 import requests
 import torch

-from transformers.testing_utils import (
-    require_vision,
-)
+from transformers.testing_utils import require_vision
 from transformers.utils import is_vision_available

 from ...test_processing_common import ProcessorTesterMixin