Add image text to text pipeline (#34170)
* Standardize image-text-to-text-models-output add post_process_image_text_to_text to chameleon and cleanup Fix legacy kwarg behavior and deprecation warning add post_process_image_text_to_text to qwen2_vl and llava_onevision Add post_process_image_text_to_text to idefics3, mllama, pixtral processor * nit var name post_process_image_text_to_text udop * nit fix deprecation warnings * Add image-text-to-text pipeline * add support for image url in chat template for pipeline * Reformat to be fully compatible with chat templates * Add tests chat template * Fix imports and tests * Add pipeline tag * change logic handling of single prompt ans multiple images * add pipeline mapping to models * fix batched inference * fix tests * Add manual batching for preprocessing * Fix outputs with nested images * Add support for all common processing kwargs * Add default padding when multiple text inputs (batch size>1) * nit change version deprecation warning * Add support for text only inference * add chat_template warnings * Add pipeline tests and add copied from post process function * Fix batched pipeline tests * nit * Fix pipeline tests blip2 * remove unnecessary max_new_tokens * revert processing kosmos2 and remove unnecessary max_new_tokens * fix pipeline tests idefics * Force try loading processor if pipeline supports it * revert load_processor change * hardcode loading only processor * remove unnecessary try except * skip imagetexttotext tests for kosmos2 as tiny model causes problems * Make code clearer * Address review comments * remove preprocessing logic from pipeline * fix fuyu * add BC resize fuyu * Move post_process_image_text_to_text to ProcessorMixin * add guard in post_process * fix zero shot object detection pipeline * add support for generator input in pipeline * nit * change default image-text-to-text model to llava onevision * fix owlv2 size dict * Change legacy deprecation warning to only show when True
This commit is contained in:
@@ -436,6 +436,7 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
"feature-extraction": BlipModel,
|
||||
"image-to-text": BlipForConditionalGeneration,
|
||||
"visual-question-answering": BlipForQuestionAnswering,
|
||||
"image-text-to-text": BlipForConditionalGeneration,
|
||||
}
|
||||
if is_torch_available()
|
||||
else {}
|
||||
|
||||
@@ -767,6 +767,7 @@ class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixi
|
||||
"feature-extraction": Blip2Model,
|
||||
"image-to-text": Blip2ForConditionalGeneration,
|
||||
"visual-question-answering": Blip2ForConditionalGeneration,
|
||||
"image-text-to-text": Blip2ForConditionalGeneration,
|
||||
}
|
||||
if is_torch_available()
|
||||
else {}
|
||||
|
||||
@@ -276,6 +276,7 @@ class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
|
||||
{
|
||||
"feature-extraction": ChameleonModel,
|
||||
"text-generation": ChameleonForConditionalGeneration,
|
||||
"image-text-to-text": ChameleonForConditionalGeneration,
|
||||
}
|
||||
if is_torch_available()
|
||||
else {}
|
||||
|
||||
@@ -265,7 +265,9 @@ class FuyuModelTester:
|
||||
@require_torch
|
||||
class FuyuModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (FuyuForCausalLM,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"text-generation": FuyuForCausalLM} if is_torch_available() else {}
|
||||
pipeline_model_mapping = (
|
||||
{"text-generation": FuyuForCausalLM, "image-text-to-text": FuyuForCausalLM} if is_torch_available() else {}
|
||||
)
|
||||
|
||||
test_head_masking = False
|
||||
test_pruning = False
|
||||
|
||||
@@ -401,7 +401,12 @@ class GitModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
all_model_classes = (GitModel, GitForCausalLM) if is_torch_available() else ()
|
||||
all_generative_model_classes = (GitForCausalLM,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = (
|
||||
{"feature-extraction": GitModel, "image-to-text": GitForCausalLM, "text-generation": GitForCausalLM}
|
||||
{
|
||||
"feature-extraction": GitModel,
|
||||
"image-to-text": GitForCausalLM,
|
||||
"text-generation": GitForCausalLM,
|
||||
"image-text-to-text": GitForCausalLM,
|
||||
}
|
||||
if is_torch_available()
|
||||
else {}
|
||||
)
|
||||
|
||||
@@ -332,7 +332,11 @@ class IdeficsModelTester:
|
||||
@require_torch
|
||||
class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (IdeficsModel, IdeficsForVisionText2Text) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"feature-extraction": IdeficsModel} if is_torch_available() else {}
|
||||
pipeline_model_mapping = (
|
||||
{"feature-extraction": IdeficsModel, "image-text-to-text": IdeficsForVisionText2Text}
|
||||
if is_torch_available()
|
||||
else {}
|
||||
)
|
||||
test_pruning = False
|
||||
test_headmasking = False
|
||||
test_torchscript = False
|
||||
|
||||
@@ -375,6 +375,7 @@ class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest
|
||||
|
||||
all_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"image-text-to-text": Idefics2ForConditionalGeneration} if is_torch_available() else ()
|
||||
fx_compatible = False
|
||||
test_pruning = False
|
||||
test_resize_embeddings = True
|
||||
|
||||
@@ -317,6 +317,7 @@ class Idefics3ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest
|
||||
|
||||
all_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"image-text-to-text": Idefics3ForConditionalGeneration} if is_torch_available() else ()
|
||||
fx_compatible = False
|
||||
test_pruning = False
|
||||
test_resize_embeddings = True
|
||||
|
||||
@@ -455,6 +455,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
|
||||
@require_torch
|
||||
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"image-text-to-text": InstructBlipForConditionalGeneration}
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
test_pruning = False
|
||||
|
||||
@@ -257,7 +257,11 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
||||
all_model_classes = (Kosmos2Model, Kosmos2ForConditionalGeneration) if is_torch_available() else ()
|
||||
all_generative_model_classes = (Kosmos2ForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = (
|
||||
{"feature-extraction": Kosmos2Model, "image-to-text": Kosmos2ForConditionalGeneration}
|
||||
{
|
||||
"feature-extraction": Kosmos2Model,
|
||||
"image-to-text": Kosmos2ForConditionalGeneration,
|
||||
"image-text-to-text": Kosmos2ForConditionalGeneration,
|
||||
}
|
||||
if is_torch_available()
|
||||
else {}
|
||||
)
|
||||
@@ -269,6 +273,7 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
||||
_is_composite = True
|
||||
|
||||
# TODO: `image-to-text` pipeline for this model needs Processor.
|
||||
# TODO: Tiny model needs fixing for `image-text-to-text` (latent_query_num=3 not compatible with num_image_tokens=64).
|
||||
def is_pipeline_test_to_skip(
|
||||
self,
|
||||
pipeline_test_case_name,
|
||||
@@ -279,7 +284,10 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
||||
feature_extractor_name,
|
||||
processor_name,
|
||||
):
|
||||
return pipeline_test_case_name == "ImageToTextPipelineTests"
|
||||
return (
|
||||
pipeline_test_case_name == "ImageToTextPipelineTests"
|
||||
or pipeline_test_case_name == "ImageTextToTextPipelineTests"
|
||||
)
|
||||
|
||||
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
|
||||
inputs_dict = copy.deepcopy(inputs_dict)
|
||||
|
||||
@@ -183,7 +183,11 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
|
||||
|
||||
all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {}
|
||||
pipeline_model_mapping = (
|
||||
{"image-to-text": LlavaForConditionalGeneration, "image-text-to-text": LlavaForConditionalGeneration}
|
||||
if is_torch_available()
|
||||
else {}
|
||||
)
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
_is_composite = True
|
||||
|
||||
@@ -216,6 +216,7 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
||||
|
||||
all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"image-text-to-text": LlavaNextForConditionalGeneration} if is_torch_available() else {}
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
_is_composite = True
|
||||
|
||||
@@ -217,6 +217,9 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
|
||||
all_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = (
|
||||
{"image-text-to-text": LlavaOnevisionForConditionalGeneration} if is_torch_available() else {}
|
||||
)
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
_is_composite = True
|
||||
|
||||
@@ -264,6 +264,7 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
|
||||
|
||||
all_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"image-text-to-text": MllamaForConditionalGeneration} if is_torch_available() else ()
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
test_torchscript = False
|
||||
|
||||
@@ -183,6 +183,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
||||
|
||||
all_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"image-text-to-text": PaliGemmaForConditionalGeneration}
|
||||
fx_compatible = False
|
||||
test_pruning = False
|
||||
test_torchscript = False
|
||||
|
||||
@@ -420,7 +420,11 @@ class Pix2StructModelTester:
|
||||
class Pix2StructModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (Pix2StructForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (Pix2StructForConditionalGeneration,) if is_torch_available() else {}
|
||||
pipeline_model_mapping = {"image-to-text": Pix2StructForConditionalGeneration} if is_torch_available() else {}
|
||||
pipeline_model_mapping = (
|
||||
{"image-to-text": Pix2StructForConditionalGeneration, "image-text-to-text": Pix2StructForConditionalGeneration}
|
||||
if is_torch_available()
|
||||
else {}
|
||||
)
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
test_pruning = False
|
||||
|
||||
@@ -224,6 +224,7 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
|
||||
|
||||
all_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"image-text-to-text": Qwen2VLForConditionalGeneration}
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
|
||||
|
||||
@@ -275,7 +275,11 @@ class UdopModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
else ()
|
||||
)
|
||||
all_generative_model_classes = (UdopForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"feature-extraction": UdopModel} if is_torch_available() else {}
|
||||
pipeline_model_mapping = (
|
||||
{"feature-extraction": UdopModel, "image-text-to-text": UdopForConditionalGeneration}
|
||||
if is_torch_available()
|
||||
else {}
|
||||
)
|
||||
fx_compatible = False
|
||||
test_pruning = False
|
||||
test_torchscript = False
|
||||
|
||||
@@ -170,6 +170,7 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
|
||||
|
||||
all_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"image-text-to-text": VipLlavaForConditionalGeneration} if is_torch_available() else {}
|
||||
fx_compatible = False
|
||||
test_pruning = False
|
||||
test_resize_embeddings = True
|
||||
|
||||
Reference in New Issue
Block a user