Add image text to text pipeline (#34170)

* Standardize image-text-to-text-models-output add post_process_image_text_to_text to chameleon and cleanup Fix legacy kwarg behavior and deprecation warning add post_process_image_text_to_text to qwen2_vl and llava_onevision Add post_process_image_text_to_text to idefics3, mllama, pixtral processor * nit var name post_process_image_text_to_text udop * nit fix deprecation warnings * Add image-text-to-text pipeline * add support for image url in chat template for pipeline * Reformat to be fully compatible with chat templates * Add tests chat template * Fix imports and tests * Add pipeline tag * change logic handling of single prompt ans multiple images * add pipeline mapping to models * fix batched inference * fix tests * Add manual batching for preprocessing * Fix outputs with nested images * Add support for all common processing kwargs * Add default padding when multiple text inputs (batch size>1) * nit change version deprecation warning * Add support for text only inference * add chat_template warnings * Add pipeline tests and add copied from post process function * Fix batched pipeline tests * nit * Fix pipeline tests blip2 * remove unnecessary max_new_tokens * revert processing kosmos2 and remove unnecessary max_new_tokens * fix pipeline tests idefics * Force try loading processor if pipeline supports it * revert load_processor change * hardcode loading only processor * remove unnecessary try except * skip imagetexttotext tests for kosmos2 as tiny model causes problems * Make code clearer * Address review comments * remove preprocessing logic from pipeline * fix fuyu * add BC resize fuyu * Move post_process_image_text_to_text to ProcessorMixin * add guard in post_process * fix zero shot object detection pipeline * add support for generator input in pipeline * nit * change default image-text-to-text model to llava onevision * fix owlv2 size dict * Change legacy deprecation warning to only show when True
2024-10-31 15:48:11 -04:00
parent c443d8d536
commit 203e27059b
47 changed files with 988 additions and 33 deletions
--- a/tests/models/blip/test_modeling_blip.py
+++ b/tests/models/blip/test_modeling_blip.py
@@ -436,6 +436,7 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
            "feature-extraction": BlipModel,
            "image-to-text": BlipForConditionalGeneration,
            "visual-question-answering": BlipForQuestionAnswering,
+            "image-text-to-text": BlipForConditionalGeneration,
        }
        if is_torch_available()
        else {}
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -767,6 +767,7 @@ class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixi
            "feature-extraction": Blip2Model,
            "image-to-text": Blip2ForConditionalGeneration,
            "visual-question-answering": Blip2ForConditionalGeneration,
+            "image-text-to-text": Blip2ForConditionalGeneration,
        }
        if is_torch_available()
        else {}
--- a/tests/models/chameleon/test_modeling_chameleon.py
+++ b/tests/models/chameleon/test_modeling_chameleon.py
@@ -276,6 +276,7 @@ class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
        {
            "feature-extraction": ChameleonModel,
            "text-generation": ChameleonForConditionalGeneration,
+            "image-text-to-text": ChameleonForConditionalGeneration,
        }
        if is_torch_available()
        else {}
--- a/tests/models/fuyu/test_modeling_fuyu.py
+++ b/tests/models/fuyu/test_modeling_fuyu.py
@@ -265,7 +265,9 @@ class FuyuModelTester:
@require_torch
 class FuyuModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    all_model_classes = (FuyuForCausalLM,) if is_torch_available() else ()
-    pipeline_model_mapping = {"text-generation": FuyuForCausalLM} if is_torch_available() else {}
+    pipeline_model_mapping = (
+        {"text-generation": FuyuForCausalLM, "image-text-to-text": FuyuForCausalLM} if is_torch_available() else {}
+    )

    test_head_masking = False
    test_pruning = False
--- a/tests/models/git/test_modeling_git.py
+++ b/tests/models/git/test_modeling_git.py
@@ -401,7 +401,12 @@ class GitModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
    all_model_classes = (GitModel, GitForCausalLM) if is_torch_available() else ()
    all_generative_model_classes = (GitForCausalLM,) if is_torch_available() else ()
    pipeline_model_mapping = (
-        {"feature-extraction": GitModel, "image-to-text": GitForCausalLM, "text-generation": GitForCausalLM}
+        {
+            "feature-extraction": GitModel,
+            "image-to-text": GitForCausalLM,
+            "text-generation": GitForCausalLM,
+            "image-text-to-text": GitForCausalLM,
+        }
        if is_torch_available()
        else {}
    )
--- a/tests/models/idefics/test_modeling_idefics.py
+++ b/tests/models/idefics/test_modeling_idefics.py
@@ -332,7 +332,11 @@ class IdeficsModelTester:
@require_torch
 class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    all_model_classes = (IdeficsModel, IdeficsForVisionText2Text) if is_torch_available() else ()
-    pipeline_model_mapping = {"feature-extraction": IdeficsModel} if is_torch_available() else {}
+    pipeline_model_mapping = (
+        {"feature-extraction": IdeficsModel, "image-text-to-text": IdeficsForVisionText2Text}
+        if is_torch_available()
+        else {}
+    )
    test_pruning = False
    test_headmasking = False
    test_torchscript = False
--- a/tests/models/idefics2/test_modeling_idefics2.py
+++ b/tests/models/idefics2/test_modeling_idefics2.py
@@ -375,6 +375,7 @@ class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest

    all_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else ()
    all_generative_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-text-to-text": Idefics2ForConditionalGeneration} if is_torch_available() else ()
    fx_compatible = False
    test_pruning = False
    test_resize_embeddings = True
--- a/tests/models/idefics3/test_modeling_idefics3.py
+++ b/tests/models/idefics3/test_modeling_idefics3.py
@@ -317,6 +317,7 @@ class Idefics3ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest

    all_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else ()
    all_generative_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-text-to-text": Idefics3ForConditionalGeneration} if is_torch_available() else ()
    fx_compatible = False
    test_pruning = False
    test_resize_embeddings = True
--- a/tests/models/instructblip/test_modeling_instructblip.py
+++ b/tests/models/instructblip/test_modeling_instructblip.py
@@ -455,6 +455,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
@require_torch
 class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-text-to-text": InstructBlipForConditionalGeneration}
    fx_compatible = False
    test_head_masking = False
    test_pruning = False
--- a/tests/models/kosmos2/test_modeling_kosmos2.py
+++ b/tests/models/kosmos2/test_modeling_kosmos2.py
@@ -257,7 +257,11 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
    all_model_classes = (Kosmos2Model, Kosmos2ForConditionalGeneration) if is_torch_available() else ()
    all_generative_model_classes = (Kosmos2ForConditionalGeneration,) if is_torch_available() else ()
    pipeline_model_mapping = (
-        {"feature-extraction": Kosmos2Model, "image-to-text": Kosmos2ForConditionalGeneration}
+        {
+            "feature-extraction": Kosmos2Model,
+            "image-to-text": Kosmos2ForConditionalGeneration,
+            "image-text-to-text": Kosmos2ForConditionalGeneration,
+        }
        if is_torch_available()
        else {}
    )
@@ -269,6 +273,7 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
    _is_composite = True

    # TODO: `image-to-text` pipeline for this model needs Processor.
+    # TODO: Tiny model needs fixing for `image-text-to-text` (latent_query_num=3 not compatible with num_image_tokens=64).
    def is_pipeline_test_to_skip(
        self,
        pipeline_test_case_name,
@@ -279,7 +284,10 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
        feature_extractor_name,
        processor_name,
    ):
-        return pipeline_test_case_name == "ImageToTextPipelineTests"
+        return (
+            pipeline_test_case_name == "ImageToTextPipelineTests"
+            or pipeline_test_case_name == "ImageTextToTextPipelineTests"
+        )

    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
        inputs_dict = copy.deepcopy(inputs_dict)
--- a/tests/models/llava/test_modeling_llava.py
+++ b/tests/models/llava/test_modeling_llava.py
@@ -183,7 +183,11 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM

    all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
    all_generative_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
-    pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {}
+    pipeline_model_mapping = (
+        {"image-to-text": LlavaForConditionalGeneration, "image-text-to-text": LlavaForConditionalGeneration}
+        if is_torch_available()
+        else {}
+    )
    test_pruning = False
    test_head_masking = False
    _is_composite = True
--- a/tests/models/llava_next/test_modeling_llava_next.py
+++ b/tests/models/llava_next/test_modeling_llava_next.py
@@ -216,6 +216,7 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes

    all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
    all_generative_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-text-to-text": LlavaNextForConditionalGeneration} if is_torch_available() else {}
    test_pruning = False
    test_head_masking = False
    _is_composite = True
--- a/tests/models/llava_onevision/test_modeling_llava_onevision.py
+++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py
@@ -217,6 +217,9 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati

    all_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else ()
    all_generative_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = (
+        {"image-text-to-text": LlavaOnevisionForConditionalGeneration} if is_torch_available() else {}
+    )
    test_pruning = False
    test_head_masking = False
    _is_composite = True
--- a/tests/models/mllama/test_modeling_mllama.py
+++ b/tests/models/mllama/test_modeling_mllama.py
@@ -264,6 +264,7 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester

    all_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else ()
    all_generative_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-text-to-text": MllamaForConditionalGeneration} if is_torch_available() else ()
    test_pruning = False
    test_head_masking = False
    test_torchscript = False
--- a/tests/models/paligemma/test_modeling_paligemma.py
+++ b/tests/models/paligemma/test_modeling_paligemma.py
@@ -183,6 +183,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes

    all_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
    all_generative_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-text-to-text": PaliGemmaForConditionalGeneration}
    fx_compatible = False
    test_pruning = False
    test_torchscript = False
--- a/tests/models/pix2struct/test_modeling_pix2struct.py
+++ b/tests/models/pix2struct/test_modeling_pix2struct.py
@@ -420,7 +420,11 @@ class Pix2StructModelTester:
 class Pix2StructModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    all_model_classes = (Pix2StructForConditionalGeneration,) if is_torch_available() else ()
    all_generative_model_classes = (Pix2StructForConditionalGeneration,) if is_torch_available() else {}
-    pipeline_model_mapping = {"image-to-text": Pix2StructForConditionalGeneration} if is_torch_available() else {}
+    pipeline_model_mapping = (
+        {"image-to-text": Pix2StructForConditionalGeneration, "image-text-to-text": Pix2StructForConditionalGeneration}
+        if is_torch_available()
+        else {}
+    )
    fx_compatible = False
    test_head_masking = False
    test_pruning = False
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -224,6 +224,7 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas

    all_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else ()
    all_generative_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-text-to-text": Qwen2VLForConditionalGeneration}
    test_pruning = False
    test_head_masking = False

--- a/tests/models/udop/test_modeling_udop.py
+++ b/tests/models/udop/test_modeling_udop.py
@@ -275,7 +275,11 @@ class UdopModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
        else ()
    )
    all_generative_model_classes = (UdopForConditionalGeneration,) if is_torch_available() else ()
-    pipeline_model_mapping = {"feature-extraction": UdopModel} if is_torch_available() else {}
+    pipeline_model_mapping = (
+        {"feature-extraction": UdopModel, "image-text-to-text": UdopForConditionalGeneration}
+        if is_torch_available()
+        else {}
+    )
    fx_compatible = False
    test_pruning = False
    test_torchscript = False
--- a/tests/models/vipllava/test_modeling_vipllava.py
+++ b/tests/models/vipllava/test_modeling_vipllava.py
@@ -170,6 +170,7 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest

    all_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
    all_generative_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-text-to-text": VipLlavaForConditionalGeneration} if is_torch_available() else {}
    fx_compatible = False
    test_pruning = False
    test_resize_embeddings = True
--- a/tests/pipelines/test_pipelines_image_text_to_text.py
+++ b/tests/pipelines/test_pipelines_image_text_to_text.py
@@ -0,0 +1,260 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, is_vision_available
+from transformers.pipelines import ImageTextToTextPipeline, pipeline
+from transformers.testing_utils import (
+    is_pipeline_test,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            pass
+
+
+@is_pipeline_test
+@require_vision
+class ImageTextToTextPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
+
+    def get_test_pipeline(self, model, tokenizer, processor, image_processor, torch_dtype="float32"):
+        pipe = ImageTextToTextPipeline(model=model, processor=processor, torch_dtype=torch_dtype)
+        image_token = getattr(processor.tokenizer, "image_token", "")
+        examples = [
+            {
+                "images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+                "text": f"{image_token}This is a ",
+            },
+            {
+                "images": "./tests/fixtures/tests_samples/COCO/000000039769.png",
+                "text": f"{image_token}Here I see a ",
+            },
+        ]
+        return pipe, examples
+
+    def run_pipeline_test(self, pipe, examples):
+        outputs = pipe(examples[0].get("images"), text=examples[0].get("text"))
+        self.assertEqual(
+            outputs,
+            [
+                {"input_text": ANY(str), "generated_text": ANY(str)},
+            ],
+        )
+
+    @require_torch
+    def test_small_model_pt_token(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        text = "<image> What this is? Assistant: This is"
+
+        outputs = pipe(image, text=text)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "<image> What this is? Assistant: This is",
+                    "generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
+                }
+            ],
+        )
+
+        outputs = pipe([image, image], text=[text, text])
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "<image> What this is? Assistant: This is",
+                    "generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
+                },
+                {
+                    "input_text": "<image> What this is? Assistant: This is",
+                    "generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
+                },
+            ],
+        )
+
+    @require_torch
+    def test_consistent_batching_behaviour(self):
+        pipe = pipeline("image-text-to-text", model="microsoft/kosmos-2-patch14-224")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        prompt = "a photo of"
+
+        outputs = pipe([image, image], text=[prompt, prompt])
+        outputs_batched = pipe([image, image], text=[prompt, prompt], batch_size=2)
+        self.assertEqual(outputs, outputs_batched)
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        image_ny = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        image_chicago = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What’s the difference between these two images?"},
+                    {"type": "image"},
+                    {"type": "image"},
+                ],
+            }
+        ]
+        outputs = pipe([image_ny, image_chicago], text=messages)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "What’s the difference between these two images?"},
+                                {"type": "image"},
+                                {"type": "image"},
+                            ],
+                        }
+                    ],
+                    "generated_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "What’s the difference between these two images?"},
+                                {"type": "image"},
+                                {"type": "image"},
+                            ],
+                        },
+                        {
+                            "role": "assistant",
+                            "content": "The first image shows a statue of the Statue of Liberty in the foreground, while the second image shows",
+                        },
+                    ],
+                }
+            ],
+        )
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_continue_final_message(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                    },
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "text", "text": "There is a dog and"},
+                ],
+            },
+        ]
+        outputs = pipe(text=messages)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image",
+                                    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                                },
+                                {"type": "text", "text": "Describe this image."},
+                            ],
+                        },
+                        {"role": "assistant", "content": [{"type": "text", "text": "There is a dog and"}]},
+                    ],
+                    "generated_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image",
+                                    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                                },
+                                {"type": "text", "text": "Describe this image."},
+                            ],
+                        },
+                        {
+                            "role": "assistant",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": "There is a dog and a person in the image. The dog is sitting on the sand, and the person is sitting on",
+                                }
+                            ],
+                        },
+                    ],
+                }
+            ],
+        )
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_new_text(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                    },
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            }
+        ]
+        outputs = pipe(text=messages, return_full_text=False)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image",
+                                    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                                },
+                                {"type": "text", "text": "Describe this image."},
+                            ],
+                        }
+                    ],
+                    "generated_text": "In the image, a woman is sitting on the sandy beach, her legs crossed in a relaxed manner",
+                }
+            ],
+        )
--- a/tests/pipelines/test_pipelines_zero_shot_object_detection.py
+++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py
@@ -14,7 +14,12 @@

 import unittest

-from transformers import MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING, is_vision_available, pipeline
+from transformers import (
+    MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING,
+    ZeroShotObjectDetectionPipeline,
+    is_vision_available,
+    pipeline,
+)
 from transformers.testing_utils import (
    is_pipeline_test,
    nested_simplify,
@@ -52,9 +57,11 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase):
        processor=None,
        torch_dtype="float32",
    ):
-        object_detector = pipeline(
-            "zero-shot-object-detection",
-            model="hf-internal-testing/tiny-random-owlvit-object-detection",
+        object_detector = ZeroShotObjectDetectionPipeline(
+            model=model,
+            processor=processor,
+            tokenizer=tokenizer,
+            image_processor=image_processor,
            torch_dtype=torch_dtype,
        )

@@ -67,7 +74,7 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase):
        return object_detector, examples

    def run_pipeline_test(self, object_detector, examples):
-        outputs = object_detector(examples[0], threshold=0.0)
+        outputs = object_detector(examples[0].get("image"), examples[0].get("candidate_labels"), threshold=0.0)

        n = len(outputs)
        self.assertGreater(n, 0)
--- a/tests/test_pipeline_mixin.py
+++ b/tests/test_pipeline_mixin.py
@@ -71,6 +71,7 @@ from .pipelines.test_pipelines_fill_mask import FillMaskPipelineTests
 from .pipelines.test_pipelines_image_classification import ImageClassificationPipelineTests
 from .pipelines.test_pipelines_image_feature_extraction import ImageFeatureExtractionPipelineTests
 from .pipelines.test_pipelines_image_segmentation import ImageSegmentationPipelineTests
+from .pipelines.test_pipelines_image_text_to_text import ImageTextToTextPipelineTests
 from .pipelines.test_pipelines_image_to_image import ImageToImagePipelineTests
 from .pipelines.test_pipelines_image_to_text import ImageToTextPipelineTests
 from .pipelines.test_pipelines_mask_generation import MaskGenerationPipelineTests
@@ -102,6 +103,7 @@ pipeline_test_mapping = {
    "image-classification": {"test": ImageClassificationPipelineTests},
    "image-feature-extraction": {"test": ImageFeatureExtractionPipelineTests},
    "image-segmentation": {"test": ImageSegmentationPipelineTests},
+    "image-text-to-text": {"test": ImageTextToTextPipelineTests},
    "image-to-image": {"test": ImageToImagePipelineTests},
    "image-to-text": {"test": ImageToTextPipelineTests},
    "mask-generation": {"test": MaskGenerationPipelineTests},
@@ -586,6 +588,18 @@ class PipelineTesterMixin:
    def test_pipeline_image_segmentation_fp16(self):
        self.run_task_tests(task="image-segmentation", torch_dtype="float16")

+    @is_pipeline_test
+    @require_vision
+    @require_torch
+    def test_pipeline_image_text_to_text(self):
+        self.run_task_tests(task="image-text-to-text")
+
+    @is_pipeline_test
+    @require_vision
+    @require_torch
+    def test_pipeline_image_text_to_text_fp16(self):
+        self.run_task_tests(task="image-text-to-text", torch_dtype="float16")
+
    @is_pipeline_test
    @require_vision
    def test_pipeline_image_to_text(self):
--- a/tests/utils/tiny_model_summary.json
+++ b/tests/utils/tiny_model_summary.json
@@ -2896,7 +2896,7 @@
        "model_classes": [
            "IdeficsForVisionText2Text"
        ],
-        "sha": "2c2f2e2cd6b02a77d0cdd8c3767ba9a6267dbd20"
+        "sha": "a6be81294ff7a3d44f3aef0ed18e42b97c426831"
    },
    "IdeficsModel": {
        "tokenizer_classes": [