Add image text to text pipeline (#34170)

* Standardize image-text-to-text-models-output add post_process_image_text_to_text to chameleon and cleanup Fix legacy kwarg behavior and deprecation warning add post_process_image_text_to_text to qwen2_vl and llava_onevision Add post_process_image_text_to_text to idefics3, mllama, pixtral processor * nit var name post_process_image_text_to_text udop * nit fix deprecation warnings * Add image-text-to-text pipeline * add support for image url in chat template for pipeline * Reformat to be fully compatible with chat templates * Add tests chat template * Fix imports and tests * Add pipeline tag * change logic handling of single prompt and multiple images * add pipeline mapping to models * fix batched inference * fix tests * Add manual batching for preprocessing * Fix outputs with nested images * Add support for all common processing kwargs * Add default padding when multiple text inputs (batch size>1) * nit change version deprecation warning * Add support for text only inference * add chat_template warnings * Add pipeline tests and add copied from post process function * Fix batched pipeline tests * nit * Fix pipeline tests blip2 * remove unnecessary max_new_tokens * revert processing kosmos2 and remove unnecessary max_new_tokens * fix pipeline tests idefics * Force try loading processor if pipeline supports it * revert load_processor change * hardcode loading only processor * remove unnecessary try except * skip imagetexttotext tests for kosmos2 as tiny model causes problems * Make code clearer * Address review comments * remove preprocessing logic from pipeline * fix fuyu * add BC resize fuyu * Move post_process_image_text_to_text to ProcessorMixin * add guard in post_process * fix zero shot object detection pipeline * add support for generator input in pipeline * nit * change default image-text-to-text model to llava onevision * fix owlv2 size dict * Change legacy deprecation warning to only show when True
2024-10-31 15:48:11 -04:00
parent c443d8d536
commit 203e27059b
47 changed files with 988 additions and 33 deletions
--- a/tests/pipelines/test_pipelines_image_text_to_text.py
+++ b/tests/pipelines/test_pipelines_image_text_to_text.py
@@ -0,0 +1,260 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, is_vision_available
+from transformers.pipelines import ImageTextToTextPipeline, pipeline
+from transformers.testing_utils import (
+    is_pipeline_test,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            pass
+
+
+@is_pipeline_test
+@require_vision
+class ImageTextToTextPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
+
+    def get_test_pipeline(self, model, tokenizer, processor, image_processor, torch_dtype="float32"):
+        pipe = ImageTextToTextPipeline(model=model, processor=processor, torch_dtype=torch_dtype)
+        image_token = getattr(processor.tokenizer, "image_token", "")
+        examples = [
+            {
+                "images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+                "text": f"{image_token}This is a ",
+            },
+            {
+                "images": "./tests/fixtures/tests_samples/COCO/000000039769.png",
+                "text": f"{image_token}Here I see a ",
+            },
+        ]
+        return pipe, examples
+
+    def run_pipeline_test(self, pipe, examples):
+        outputs = pipe(examples[0].get("images"), text=examples[0].get("text"))
+        self.assertEqual(
+            outputs,
+            [
+                {"input_text": ANY(str), "generated_text": ANY(str)},
+            ],
+        )
+
+    @require_torch
+    def test_small_model_pt_token(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        text = "<image> What this is? Assistant: This is"
+
+        outputs = pipe(image, text=text)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "<image> What this is? Assistant: This is",
+                    "generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
+                }
+            ],
+        )
+
+        outputs = pipe([image, image], text=[text, text])
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "<image> What this is? Assistant: This is",
+                    "generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
+                },
+                {
+                    "input_text": "<image> What this is? Assistant: This is",
+                    "generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
+                },
+            ],
+        )
+
+    @require_torch
+    def test_consistent_batching_behaviour(self):
+        pipe = pipeline("image-text-to-text", model="microsoft/kosmos-2-patch14-224")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        prompt = "a photo of"
+
+        outputs = pipe([image, image], text=[prompt, prompt])
+        outputs_batched = pipe([image, image], text=[prompt, prompt], batch_size=2)
+        self.assertEqual(outputs, outputs_batched)
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        image_ny = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        image_chicago = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What’s the difference between these two images?"},
+                    {"type": "image"},
+                    {"type": "image"},
+                ],
+            }
+        ]
+        outputs = pipe([image_ny, image_chicago], text=messages)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "What’s the difference between these two images?"},
+                                {"type": "image"},
+                                {"type": "image"},
+                            ],
+                        }
+                    ],
+                    "generated_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "What’s the difference between these two images?"},
+                                {"type": "image"},
+                                {"type": "image"},
+                            ],
+                        },
+                        {
+                            "role": "assistant",
+                            "content": "The first image shows a statue of the Statue of Liberty in the foreground, while the second image shows",
+                        },
+                    ],
+                }
+            ],
+        )
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_continue_final_message(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                    },
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "text", "text": "There is a dog and"},
+                ],
+            },
+        ]
+        outputs = pipe(text=messages)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image",
+                                    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                                },
+                                {"type": "text", "text": "Describe this image."},
+                            ],
+                        },
+                        {"role": "assistant", "content": [{"type": "text", "text": "There is a dog and"}]},
+                    ],
+                    "generated_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image",
+                                    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                                },
+                                {"type": "text", "text": "Describe this image."},
+                            ],
+                        },
+                        {
+                            "role": "assistant",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": "There is a dog and a person in the image. The dog is sitting on the sand, and the person is sitting on",
+                                }
+                            ],
+                        },
+                    ],
+                }
+            ],
+        )
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_new_text(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                    },
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            }
+        ]
+        outputs = pipe(text=messages, return_full_text=False)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image",
+                                    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                                },
+                                {"type": "text", "text": "Describe this image."},
+                            ],
+                        }
+                    ],
+                    "generated_text": "In the image, a woman is sitting on the sandy beach, her legs crossed in a relaxed manner",
+                }
+            ],
+        )
--- a/tests/pipelines/test_pipelines_zero_shot_object_detection.py
+++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py
@@ -14,7 +14,12 @@

 import unittest

-from transformers import MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING, is_vision_available, pipeline
+from transformers import (
+    MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING,
+    ZeroShotObjectDetectionPipeline,
+    is_vision_available,
+    pipeline,
+)
 from transformers.testing_utils import (
    is_pipeline_test,
    nested_simplify,
@@ -52,9 +57,11 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase):
        processor=None,
        torch_dtype="float32",
    ):
-        object_detector = pipeline(
-            "zero-shot-object-detection",
-            model="hf-internal-testing/tiny-random-owlvit-object-detection",
+        object_detector = ZeroShotObjectDetectionPipeline(
+            model=model,
+            processor=processor,
+            tokenizer=tokenizer,
+            image_processor=image_processor,
            torch_dtype=torch_dtype,
        )

@@ -67,7 +74,7 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase):
        return object_detector, examples

    def run_pipeline_test(self, object_detector, examples):
-        outputs = object_detector(examples[0], threshold=0.0)
+        outputs = object_detector(examples[0].get("image"), examples[0].get("candidate_labels"), threshold=0.0)

        n = len(outputs)
        self.assertGreater(n, 0)