Add auto model for image-text-to-text (#32472)

* Add Auto model for image-text-to-text * Remove donut from processing auto, add chameleon ti image text to text models * add qwen2_vl and llava_onevision * add pixtral to auto model for image-text-to-text * add mllama and idefics3 * remove models in IGNORE_NON_AUTO_CONFIGURED * add AutoModelForImageTextToText to tests and doc
2024-10-08 14:26:43 +02:00
parent 0dbc7090ba
commit e2001c3413
11 changed files with 89 additions and 28 deletions
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@@ -381,3 +381,7 @@ The following auto classes are available for the following multimodal tasks.
 ### FlaxAutoModelForVision2Seq

 [[autodoc]] FlaxAutoModelForVision2Seq
+
+### AutoModelForImageTextToText
+
+[[autodoc]] AutoModelForImageTextToText
--- a/docs/source/en/model_doc/llava_next.md
+++ b/docs/source/en/model_doc/llava_next.md
@@ -166,10 +166,10 @@ LLaVa-Next can perform inference with multiple images as input, where images eit
 import requests
 from PIL import Image
 import torch
-from transformers import AutoProcessor, LlavaNextForConditionalGeneration
+from transformers import AutoProcessor, AutoModelForImageTextToText

 # Load the model in half-precision
-model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
+model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
 processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

 # Get three different images
@@ -246,7 +246,7 @@ We value your feedback to help identify bugs before the full release! Check out
 Simply change the snippet above with:

 ```python
-from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig
+from transformers import AutoModelForImageTextToText, BitsAndBytesConfig

 # specify how to quantize the model
 quantization_config = BitsAndBytesConfig(
@@ -255,7 +255,7 @@ quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
 )

-model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
+model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
 ```

 ### Use Flash-Attention 2 to further speed-up generation
@@ -263,9 +263,9 @@ model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-m
 First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) regarding that package installation. Simply change the snippet above with:

 ```python
-from transformers import LlavaNextForConditionalGeneration
+from transformers import AutoModelForImageTextToText

-model = LlavaNextForConditionalGeneration.from_pretrained(
+model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
--- a/docs/source/en/tasks/image_text_to_text.md
+++ b/docs/source/en/tasks/image_text_to_text.md
@@ -38,11 +38,11 @@ pip install -q transformers accelerate flash_attn
 Let's initialize the model and the processor.

 ```python
-from transformers import AutoProcessor, Idefics2ForConditionalGeneration
+from transformers import AutoProcessor, AutoModelForImageTextToText
 import torch

 device = torch.device("cuda")
-model = Idefics2ForConditionalGeneration.from_pretrained(
+model = AutoModelForImageTextToText.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
@@ -215,11 +215,13 @@ pip install -U quanto bitsandbytes
 To quantize a model during loading, we need to first create [`QuantoConfig`]. Then load the model as usual, but pass `quantization_config` during model initialization.

 ```python
-from transformers import Idefics2ForConditionalGeneration, AutoTokenizer, QuantoConfig
+from transformers import AutoModelForImageTextToText, QuantoConfig

 model_id = "HuggingFaceM4/idefics2-8b"
 quantization_config = QuantoConfig(weights="int8")
-quantized_model = Idefics2ForConditionalGeneration.from_pretrained(model_id, device_map="cuda", quantization_config=quantization_config)
+quantized_model = AutoModelForImageTextToText.from_pretrained(
+    model_id, device_map="cuda", quantization_config=quantization_config
+)
 ```

 And that's it, we can use the model the same way with no changes.
--- a/docs/source/ja/model_doc/auto.md
+++ b/docs/source/ja/model_doc/auto.md
@@ -368,3 +368,7 @@ AutoModel.register(NewModelConfig, NewModel)
 ### FlaxAutoModelForVision2Seq

 [[autodoc]] FlaxAutoModelForVision2Seq
+
+### AutoModelForImageTextToText
+
+[[autodoc]] AutoModelForImageTextToText
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -1407,6 +1407,7 @@ else:
            "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
            "MODEL_FOR_IMAGE_MAPPING",
            "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
+            "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
            "MODEL_FOR_IMAGE_TO_IMAGE_MAPPING",
            "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
            "MODEL_FOR_KEYPOINT_DETECTION_MAPPING",
@@ -1448,6 +1449,7 @@ else:
            "AutoModelForDocumentQuestionAnswering",
            "AutoModelForImageClassification",
            "AutoModelForImageSegmentation",
+            "AutoModelForImageTextToText",
            "AutoModelForImageToImage",
            "AutoModelForInstanceSegmentation",
            "AutoModelForKeypointDetection",
@@ -6272,6 +6274,7 @@ if TYPE_CHECKING:
            MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
            MODEL_FOR_IMAGE_MAPPING,
            MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+            MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
            MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
            MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
            MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
@@ -6313,6 +6316,7 @@ if TYPE_CHECKING:
            AutoModelForDocumentQuestionAnswering,
            AutoModelForImageClassification,
            AutoModelForImageSegmentation,
+            AutoModelForImageTextToText,
            AutoModelForImageToImage,
            AutoModelForInstanceSegmentation,
            AutoModelForKeypointDetection,
--- a/src/transformers/models/auto/init.py
+++ b/src/transformers/models/auto/init.py
@@ -74,6 +74,7 @@ else:
        "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
        "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
        "MODEL_FOR_VISION_2_SEQ_MAPPING",
+        "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
        "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
        "MODEL_MAPPING",
        "MODEL_WITH_LM_HEAD_MAPPING",
@@ -119,6 +120,7 @@ else:
        "AutoModelWithLMHead",
        "AutoModelForZeroShotImageClassification",
        "AutoModelForZeroShotObjectDetection",
+        "AutoModelForImageTextToText",
    ]

 try:
@@ -238,6 +240,7 @@ if TYPE_CHECKING:
            MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
            MODEL_FOR_IMAGE_MAPPING,
            MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+            MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
            MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
            MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
            MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
@@ -279,6 +282,7 @@ if TYPE_CHECKING:
            AutoModelForDocumentQuestionAnswering,
            AutoModelForImageClassification,
            AutoModelForImageSegmentation,
+            AutoModelForImageTextToText,
            AutoModelForImageToImage,
            AutoModelForInstanceSegmentation,
            AutoModelForKeypointDetection,
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -757,6 +757,32 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
    ]
 )

+MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
+    [
+        ("blip", "BlipForConditionalGeneration"),
+        ("blip-2", "Blip2ForConditionalGeneration"),
+        ("chameleon", "ChameleonForConditionalGeneration"),
+        ("fuyu", "FuyuForCausalLM"),
+        ("git", "GitForCausalLM"),
+        ("idefics", "IdeficsForVisionText2Text"),
+        ("idefics2", "Idefics2ForConditionalGeneration"),
+        ("idefics3", "Idefics3ForConditionalGeneration"),
+        ("instructblip", "InstructBlipForConditionalGeneration"),
+        ("kosmos-2", "Kosmos2ForConditionalGeneration"),
+        ("llava", "LlavaForConditionalGeneration"),
+        ("llava_next", "LlavaNextForConditionalGeneration"),
+        ("llava_onevision", "LlavaOnevisionForConditionalGeneration"),
+        ("mllama", "MllamaForConditionalGeneration"),
+        ("paligemma", "PaliGemmaForConditionalGeneration"),
+        ("pix2struct", "Pix2StructForConditionalGeneration"),
+        ("pixtral", "LlavaForConditionalGeneration"),
+        ("qwen2_vl", "Qwen2VLForConditionalGeneration"),
+        ("udop", "UdopForConditionalGeneration"),
+        ("vipllava", "VipLlavaForConditionalGeneration"),
+        ("vision-encoder-decoder", "VisionEncoderDecoderModel"),
+    ]
+)
+
 MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
    [
        # Model for Masked LM mapping
@@ -1419,6 +1445,9 @@ MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
    CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
 )
 MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
+MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping(
+    CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+)
 MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
    CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
 )
@@ -1713,6 +1742,13 @@ class AutoModelForVision2Seq(_BaseAutoModelClass):
 AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling")


+class AutoModelForImageTextToText(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
+
+
+AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling")
+
+
 class AutoModelForAudioClassification(_BaseAutoModelClass):
    _model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING

--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -99,6 +99,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
        ("trocr", "TrOCRProcessor"),
        ("tvlt", "TvltProcessor"),
        ("tvp", "TvpProcessor"),
+        ("udop", "UdopProcessor"),
        ("unispeech", "Wav2Vec2Processor"),
        ("unispeech-sat", "Wav2Vec2Processor"),
        ("video_llava", "VideoLlavaProcessor"),
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -707,6 +707,9 @@ MODEL_FOR_IMAGE_MAPPING = None
 MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None


+MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = None
+
+
 MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = None


@@ -874,6 +877,13 @@ class AutoModelForImageSegmentation(metaclass=DummyObject):
        requires_backends(self, ["torch"])


+class AutoModelForImageTextToText(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class AutoModelForImageToImage(metaclass=DummyObject):
    _backends = ["torch"]

--- a/tests/models/kosmos2/test_modeling_kosmos2.py
+++ b/tests/models/kosmos2/test_modeling_kosmos2.py
@@ -23,7 +23,7 @@ import unittest
 import numpy as np
 import requests

-from transformers import AutoModelForVision2Seq, AutoProcessor, Kosmos2Config
+from transformers import AutoModelForImageTextToText, AutoProcessor, Kosmos2Config
 from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig
 from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device
 from transformers.utils import is_torch_available, is_vision_available
@@ -551,7 +551,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
        image.save("new_image.jpg")
        image = Image.open("new_image.jpg")

-        model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
+        model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
        processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        prompt = "<grounding>An image of"
@@ -697,7 +697,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
        image.save("new_image.jpg")
        image = Image.open("new_image.jpg")

-        model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
+        model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)

        prompt = ["<grounding>Describe this image in detail:", "<grounding>An image of"]

--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -170,7 +170,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    "ClapTextModelWithProjection",
    "ClapAudioModel",
    "ClapAudioModelWithProjection",
-    "Blip2ForConditionalGeneration",
    "Blip2TextModelWithProjection",
    "Blip2VisionModelWithProjection",
    "Blip2QFormerModel",
@@ -181,7 +180,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    "GitVisionModel",
    "GraphormerModel",
    "GraphormerForGraphClassification",
-    "BlipForConditionalGeneration",
    "BlipForImageTextRetrieval",
    "BlipForQuestionAnswering",
    "BlipVisionModel",
@@ -245,7 +243,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    "DetrForSegmentation",
    "Pix2StructVisionModel",
    "Pix2StructTextModel",
-    "Pix2StructForConditionalGeneration",
    "ConditionalDetrForSegmentation",
    "DPRReader",
    "FlaubertForQuestionAnswering",
@@ -322,7 +319,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    "SeamlessM4TCodeHifiGan",
    "SeamlessM4TForSpeechToSpeech",  # no auto class for speech-to-speech
    "TvpForVideoGrounding",
-    "UdopForConditionalGeneration",
    "SeamlessM4Tv2NARTextToUnitModel",
    "SeamlessM4Tv2NARTextToUnitForConditionalGeneration",
    "SeamlessM4Tv2CodeHifiGan",