Add image text to text pipeline (#34170)

* Standardize image-text-to-text-models-output

add post_process_image_text_to_text to chameleon and cleanup

Fix legacy kwarg behavior and deprecation warning

add post_process_image_text_to_text to qwen2_vl and llava_onevision

Add post_process_image_text_to_text to idefics3, mllama, pixtral processor

* nit var name post_process_image_text_to_text udop

* nit fix deprecation warnings

* Add image-text-to-text pipeline

* add support for image url in chat template for pipeline

* Reformat to be fully compatible with chat templates

* Add tests chat template

* Fix imports and tests

* Add pipeline tag

* change logic handling of single prompt ans multiple images

* add pipeline mapping to models

* fix batched inference

* fix tests

* Add manual batching for preprocessing

* Fix outputs with nested images

* Add support for all common processing kwargs

* Add default padding when multiple text inputs (batch size>1)

* nit change version deprecation warning

* Add support for text only inference

* add chat_template warnings

* Add pipeline tests and add copied from post process function

* Fix batched pipeline tests

* nit

* Fix pipeline tests blip2

* remove unnecessary max_new_tokens

* revert processing kosmos2 and remove unnecessary max_new_tokens

* fix pipeline tests idefics

* Force try loading processor if pipeline supports it

* revert load_processor change

* hardcode loading only processor

* remove unnecessary try except

* skip imagetexttotext tests for kosmos2 as tiny model causes problems

* Make code clearer

* Address review comments

* remove preprocessing logic from pipeline

* fix fuyu

* add BC resize fuyu

* Move post_process_image_text_to_text to ProcessorMixin

* add guard in post_process

* fix zero shot object detection pipeline

* add support for generator input in pipeline

* nit

* change default image-text-to-text model to llava onevision

* fix owlv2 size dict

* Change legacy deprecation warning to only show when True
This commit is contained in:
Yoni Gozlan
2024-10-31 15:48:11 -04:00
committed by GitHub
parent c443d8d536
commit 203e27059b
47 changed files with 988 additions and 33 deletions

View File

@@ -436,6 +436,7 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
"feature-extraction": BlipModel,
"image-to-text": BlipForConditionalGeneration,
"visual-question-answering": BlipForQuestionAnswering,
"image-text-to-text": BlipForConditionalGeneration,
}
if is_torch_available()
else {}

View File

@@ -767,6 +767,7 @@ class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixi
"feature-extraction": Blip2Model,
"image-to-text": Blip2ForConditionalGeneration,
"visual-question-answering": Blip2ForConditionalGeneration,
"image-text-to-text": Blip2ForConditionalGeneration,
}
if is_torch_available()
else {}

View File

@@ -276,6 +276,7 @@ class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
{
"feature-extraction": ChameleonModel,
"text-generation": ChameleonForConditionalGeneration,
"image-text-to-text": ChameleonForConditionalGeneration,
}
if is_torch_available()
else {}

View File

@@ -265,7 +265,9 @@ class FuyuModelTester:
@require_torch
class FuyuModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (FuyuForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = {"text-generation": FuyuForCausalLM} if is_torch_available() else {}
pipeline_model_mapping = (
{"text-generation": FuyuForCausalLM, "image-text-to-text": FuyuForCausalLM} if is_torch_available() else {}
)
test_head_masking = False
test_pruning = False

View File

@@ -401,7 +401,12 @@ class GitModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
all_model_classes = (GitModel, GitForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (GitForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": GitModel, "image-to-text": GitForCausalLM, "text-generation": GitForCausalLM}
{
"feature-extraction": GitModel,
"image-to-text": GitForCausalLM,
"text-generation": GitForCausalLM,
"image-text-to-text": GitForCausalLM,
}
if is_torch_available()
else {}
)

View File

@@ -332,7 +332,11 @@ class IdeficsModelTester:
@require_torch
class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (IdeficsModel, IdeficsForVisionText2Text) if is_torch_available() else ()
pipeline_model_mapping = {"feature-extraction": IdeficsModel} if is_torch_available() else {}
pipeline_model_mapping = (
{"feature-extraction": IdeficsModel, "image-text-to-text": IdeficsForVisionText2Text}
if is_torch_available()
else {}
)
test_pruning = False
test_headmasking = False
test_torchscript = False

View File

@@ -375,6 +375,7 @@ class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest
all_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": Idefics2ForConditionalGeneration} if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = True

View File

@@ -317,6 +317,7 @@ class Idefics3ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest
all_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": Idefics3ForConditionalGeneration} if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = True

View File

@@ -455,6 +455,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
@require_torch
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": InstructBlipForConditionalGeneration}
fx_compatible = False
test_head_masking = False
test_pruning = False

View File

@@ -257,7 +257,11 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
all_model_classes = (Kosmos2Model, Kosmos2ForConditionalGeneration) if is_torch_available() else ()
all_generative_model_classes = (Kosmos2ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": Kosmos2Model, "image-to-text": Kosmos2ForConditionalGeneration}
{
"feature-extraction": Kosmos2Model,
"image-to-text": Kosmos2ForConditionalGeneration,
"image-text-to-text": Kosmos2ForConditionalGeneration,
}
if is_torch_available()
else {}
)
@@ -269,6 +273,7 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
_is_composite = True
# TODO: `image-to-text` pipeline for this model needs Processor.
# TODO: Tiny model needs fixing for `image-text-to-text` (latent_query_num=3 not compatible with num_image_tokens=64).
def is_pipeline_test_to_skip(
self,
pipeline_test_case_name,
@@ -279,7 +284,10 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
feature_extractor_name,
processor_name,
):
return pipeline_test_case_name == "ImageToTextPipelineTests"
return (
pipeline_test_case_name == "ImageToTextPipelineTests"
or pipeline_test_case_name == "ImageTextToTextPipelineTests"
)
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = copy.deepcopy(inputs_dict)

View File

@@ -183,7 +183,11 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {}
pipeline_model_mapping = (
{"image-to-text": LlavaForConditionalGeneration, "image-text-to-text": LlavaForConditionalGeneration}
if is_torch_available()
else {}
)
test_pruning = False
test_head_masking = False
_is_composite = True

View File

@@ -216,6 +216,7 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": LlavaNextForConditionalGeneration} if is_torch_available() else {}
test_pruning = False
test_head_masking = False
_is_composite = True

View File

@@ -217,6 +217,9 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
all_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{"image-text-to-text": LlavaOnevisionForConditionalGeneration} if is_torch_available() else {}
)
test_pruning = False
test_head_masking = False
_is_composite = True

View File

@@ -264,6 +264,7 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
all_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": MllamaForConditionalGeneration} if is_torch_available() else ()
test_pruning = False
test_head_masking = False
test_torchscript = False

View File

@@ -183,6 +183,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
all_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": PaliGemmaForConditionalGeneration}
fx_compatible = False
test_pruning = False
test_torchscript = False

View File

@@ -420,7 +420,11 @@ class Pix2StructModelTester:
class Pix2StructModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (Pix2StructForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Pix2StructForConditionalGeneration,) if is_torch_available() else {}
pipeline_model_mapping = {"image-to-text": Pix2StructForConditionalGeneration} if is_torch_available() else {}
pipeline_model_mapping = (
{"image-to-text": Pix2StructForConditionalGeneration, "image-text-to-text": Pix2StructForConditionalGeneration}
if is_torch_available()
else {}
)
fx_compatible = False
test_head_masking = False
test_pruning = False

View File

@@ -224,6 +224,7 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
all_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": Qwen2VLForConditionalGeneration}
test_pruning = False
test_head_masking = False

View File

@@ -275,7 +275,11 @@ class UdopModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
else ()
)
all_generative_model_classes = (UdopForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"feature-extraction": UdopModel} if is_torch_available() else {}
pipeline_model_mapping = (
{"feature-extraction": UdopModel, "image-text-to-text": UdopForConditionalGeneration}
if is_torch_available()
else {}
)
fx_compatible = False
test_pruning = False
test_torchscript = False

View File

@@ -170,6 +170,7 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
all_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": VipLlavaForConditionalGeneration} if is_torch_available() else {}
fx_compatible = False
test_pruning = False
test_resize_embeddings = True

View File

@@ -0,0 +1,260 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, is_vision_available
from transformers.pipelines import ImageTextToTextPipeline, pipeline
from transformers.testing_utils import (
is_pipeline_test,
require_torch,
require_vision,
slow,
)
from .test_pipelines_common import ANY
if is_vision_available():
from PIL import Image
else:
class Image:
@staticmethod
def open(*args, **kwargs):
pass
@is_pipeline_test
@require_vision
class ImageTextToTextPipelineTests(unittest.TestCase):
model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
def get_test_pipeline(self, model, tokenizer, processor, image_processor, torch_dtype="float32"):
pipe = ImageTextToTextPipeline(model=model, processor=processor, torch_dtype=torch_dtype)
image_token = getattr(processor.tokenizer, "image_token", "")
examples = [
{
"images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
"text": f"{image_token}This is a ",
},
{
"images": "./tests/fixtures/tests_samples/COCO/000000039769.png",
"text": f"{image_token}Here I see a ",
},
]
return pipe, examples
def run_pipeline_test(self, pipe, examples):
outputs = pipe(examples[0].get("images"), text=examples[0].get("text"))
self.assertEqual(
outputs,
[
{"input_text": ANY(str), "generated_text": ANY(str)},
],
)
@require_torch
def test_small_model_pt_token(self):
pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
text = "<image> What this is? Assistant: This is"
outputs = pipe(image, text=text)
self.assertEqual(
outputs,
[
{
"input_text": "<image> What this is? Assistant: This is",
"generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
}
],
)
outputs = pipe([image, image], text=[text, text])
self.assertEqual(
outputs,
[
{
"input_text": "<image> What this is? Assistant: This is",
"generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
},
{
"input_text": "<image> What this is? Assistant: This is",
"generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
},
],
)
@require_torch
def test_consistent_batching_behaviour(self):
pipe = pipeline("image-text-to-text", model="microsoft/kosmos-2-patch14-224")
image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
prompt = "a photo of"
outputs = pipe([image, image], text=[prompt, prompt])
outputs_batched = pipe([image, image], text=[prompt, prompt], batch_size=2)
self.assertEqual(outputs, outputs_batched)
@slow
@require_torch
def test_model_pt_chat_template(self):
pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
image_ny = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
image_chicago = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Whats the difference between these two images?"},
{"type": "image"},
{"type": "image"},
],
}
]
outputs = pipe([image_ny, image_chicago], text=messages)
self.assertEqual(
outputs,
[
{
"input_text": [
{
"role": "user",
"content": [
{"type": "text", "text": "Whats the difference between these two images?"},
{"type": "image"},
{"type": "image"},
],
}
],
"generated_text": [
{
"role": "user",
"content": [
{"type": "text", "text": "Whats the difference between these two images?"},
{"type": "image"},
{"type": "image"},
],
},
{
"role": "assistant",
"content": "The first image shows a statue of the Statue of Liberty in the foreground, while the second image shows",
},
],
}
],
)
@slow
@require_torch
def test_model_pt_chat_template_continue_final_message(self):
pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
},
{"type": "text", "text": "Describe this image."},
],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "There is a dog and"},
],
},
]
outputs = pipe(text=messages)
self.assertEqual(
outputs,
[
{
"input_text": [
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
},
{"type": "text", "text": "Describe this image."},
],
},
{"role": "assistant", "content": [{"type": "text", "text": "There is a dog and"}]},
],
"generated_text": [
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
},
{"type": "text", "text": "Describe this image."},
],
},
{
"role": "assistant",
"content": [
{
"type": "text",
"text": "There is a dog and a person in the image. The dog is sitting on the sand, and the person is sitting on",
}
],
},
],
}
],
)
@slow
@require_torch
def test_model_pt_chat_template_new_text(self):
pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
},
{"type": "text", "text": "Describe this image."},
],
}
]
outputs = pipe(text=messages, return_full_text=False)
self.assertEqual(
outputs,
[
{
"input_text": [
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
},
{"type": "text", "text": "Describe this image."},
],
}
],
"generated_text": "In the image, a woman is sitting on the sandy beach, her legs crossed in a relaxed manner",
}
],
)

View File

@@ -14,7 +14,12 @@
import unittest
from transformers import MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING, is_vision_available, pipeline
from transformers import (
MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING,
ZeroShotObjectDetectionPipeline,
is_vision_available,
pipeline,
)
from transformers.testing_utils import (
is_pipeline_test,
nested_simplify,
@@ -52,9 +57,11 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase):
processor=None,
torch_dtype="float32",
):
object_detector = pipeline(
"zero-shot-object-detection",
model="hf-internal-testing/tiny-random-owlvit-object-detection",
object_detector = ZeroShotObjectDetectionPipeline(
model=model,
processor=processor,
tokenizer=tokenizer,
image_processor=image_processor,
torch_dtype=torch_dtype,
)
@@ -67,7 +74,7 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase):
return object_detector, examples
def run_pipeline_test(self, object_detector, examples):
outputs = object_detector(examples[0], threshold=0.0)
outputs = object_detector(examples[0].get("image"), examples[0].get("candidate_labels"), threshold=0.0)
n = len(outputs)
self.assertGreater(n, 0)

View File

@@ -71,6 +71,7 @@ from .pipelines.test_pipelines_fill_mask import FillMaskPipelineTests
from .pipelines.test_pipelines_image_classification import ImageClassificationPipelineTests
from .pipelines.test_pipelines_image_feature_extraction import ImageFeatureExtractionPipelineTests
from .pipelines.test_pipelines_image_segmentation import ImageSegmentationPipelineTests
from .pipelines.test_pipelines_image_text_to_text import ImageTextToTextPipelineTests
from .pipelines.test_pipelines_image_to_image import ImageToImagePipelineTests
from .pipelines.test_pipelines_image_to_text import ImageToTextPipelineTests
from .pipelines.test_pipelines_mask_generation import MaskGenerationPipelineTests
@@ -102,6 +103,7 @@ pipeline_test_mapping = {
"image-classification": {"test": ImageClassificationPipelineTests},
"image-feature-extraction": {"test": ImageFeatureExtractionPipelineTests},
"image-segmentation": {"test": ImageSegmentationPipelineTests},
"image-text-to-text": {"test": ImageTextToTextPipelineTests},
"image-to-image": {"test": ImageToImagePipelineTests},
"image-to-text": {"test": ImageToTextPipelineTests},
"mask-generation": {"test": MaskGenerationPipelineTests},
@@ -586,6 +588,18 @@ class PipelineTesterMixin:
def test_pipeline_image_segmentation_fp16(self):
self.run_task_tests(task="image-segmentation", torch_dtype="float16")
@is_pipeline_test
@require_vision
@require_torch
def test_pipeline_image_text_to_text(self):
self.run_task_tests(task="image-text-to-text")
@is_pipeline_test
@require_vision
@require_torch
def test_pipeline_image_text_to_text_fp16(self):
self.run_task_tests(task="image-text-to-text", torch_dtype="float16")
@is_pipeline_test
@require_vision
def test_pipeline_image_to_text(self):

View File

@@ -2896,7 +2896,7 @@
"model_classes": [
"IdeficsForVisionText2Text"
],
"sha": "2c2f2e2cd6b02a77d0cdd8c3767ba9a6267dbd20"
"sha": "a6be81294ff7a3d44f3aef0ed18e42b97c426831"
},
"IdeficsModel": {
"tokenizer_classes": [