Add auto model for image-text-to-text (#32472)

* Add Auto model for image-text-to-text

* Remove donut from processing auto, add chameleon ti image text to text models

* add qwen2_vl and llava_onevision

* add pixtral to auto model for image-text-to-text

* add mllama and idefics3

* remove models in IGNORE_NON_AUTO_CONFIGURED

* add AutoModelForImageTextToText to tests and doc
This commit is contained in:
Yoni Gozlan
2024-10-08 14:26:43 +02:00
committed by GitHub
parent 0dbc7090ba
commit e2001c3413
11 changed files with 89 additions and 28 deletions

View File

@@ -381,3 +381,7 @@ The following auto classes are available for the following multimodal tasks.
### FlaxAutoModelForVision2Seq
[[autodoc]] FlaxAutoModelForVision2Seq
### AutoModelForImageTextToText
[[autodoc]] AutoModelForImageTextToText

View File

@@ -166,10 +166,10 @@ LLaVa-Next can perform inference with multiple images as input, where images eit
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaNextForConditionalGeneration
from transformers import AutoProcessor, AutoModelForImageTextToText
# Load the model in half-precision
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
# Get three different images
@@ -246,7 +246,7 @@ We value your feedback to help identify bugs before the full release! Check out
Simply change the snippet above with:
```python
from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig
from transformers import AutoModelForImageTextToText, BitsAndBytesConfig
# specify how to quantize the model
quantization_config = BitsAndBytesConfig(
@@ -255,7 +255,7 @@ quantization_config = BitsAndBytesConfig(
bnb_4bit_compute_dtype=torch.float16,
)
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
```
### Use Flash-Attention 2 to further speed-up generation
@@ -263,9 +263,9 @@ model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-m
First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) regarding that package installation. Simply change the snippet above with:
```python
from transformers import LlavaNextForConditionalGeneration
from transformers import AutoModelForImageTextToText
model = LlavaNextForConditionalGeneration.from_pretrained(
model = AutoModelForImageTextToText.from_pretrained(
model_id,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,

View File

@@ -38,11 +38,11 @@ pip install -q transformers accelerate flash_attn
Let's initialize the model and the processor.
```python
from transformers import AutoProcessor, Idefics2ForConditionalGeneration
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
device = torch.device("cuda")
model = Idefics2ForConditionalGeneration.from_pretrained(
model = AutoModelForImageTextToText.from_pretrained(
"HuggingFaceM4/idefics2-8b",
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
@@ -215,11 +215,13 @@ pip install -U quanto bitsandbytes
To quantize a model during loading, we need to first create [`QuantoConfig`]. Then load the model as usual, but pass `quantization_config` during model initialization.
```python
from transformers import Idefics2ForConditionalGeneration, AutoTokenizer, QuantoConfig
from transformers import AutoModelForImageTextToText, QuantoConfig
model_id = "HuggingFaceM4/idefics2-8b"
quantization_config = QuantoConfig(weights="int8")
quantized_model = Idefics2ForConditionalGeneration.from_pretrained(model_id, device_map="cuda", quantization_config=quantization_config)
quantized_model = AutoModelForImageTextToText.from_pretrained(
model_id, device_map="cuda", quantization_config=quantization_config
)
```
And that's it, we can use the model the same way with no changes.

View File

@@ -368,3 +368,7 @@ AutoModel.register(NewModelConfig, NewModel)
### FlaxAutoModelForVision2Seq
[[autodoc]] FlaxAutoModelForVision2Seq
### AutoModelForImageTextToText
[[autodoc]] AutoModelForImageTextToText

View File

@@ -1407,6 +1407,7 @@ else:
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"MODEL_FOR_IMAGE_MAPPING",
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
"MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
"MODEL_FOR_IMAGE_TO_IMAGE_MAPPING",
"MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
"MODEL_FOR_KEYPOINT_DETECTION_MAPPING",
@@ -1448,6 +1449,7 @@ else:
"AutoModelForDocumentQuestionAnswering",
"AutoModelForImageClassification",
"AutoModelForImageSegmentation",
"AutoModelForImageTextToText",
"AutoModelForImageToImage",
"AutoModelForInstanceSegmentation",
"AutoModelForKeypointDetection",
@@ -6272,6 +6274,7 @@ if TYPE_CHECKING:
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_IMAGE_MAPPING,
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
@@ -6313,6 +6316,7 @@ if TYPE_CHECKING:
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification,
AutoModelForImageSegmentation,
AutoModelForImageTextToText,
AutoModelForImageToImage,
AutoModelForInstanceSegmentation,
AutoModelForKeypointDetection,

View File

@@ -74,6 +74,7 @@ else:
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
"MODEL_FOR_VISION_2_SEQ_MAPPING",
"MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
"MODEL_MAPPING",
"MODEL_WITH_LM_HEAD_MAPPING",
@@ -119,6 +120,7 @@ else:
"AutoModelWithLMHead",
"AutoModelForZeroShotImageClassification",
"AutoModelForZeroShotObjectDetection",
"AutoModelForImageTextToText",
]
try:
@@ -238,6 +240,7 @@ if TYPE_CHECKING:
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_IMAGE_MAPPING,
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
@@ -279,6 +282,7 @@ if TYPE_CHECKING:
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification,
AutoModelForImageSegmentation,
AutoModelForImageTextToText,
AutoModelForImageToImage,
AutoModelForInstanceSegmentation,
AutoModelForKeypointDetection,

View File

@@ -757,6 +757,32 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
]
)
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
[
("blip", "BlipForConditionalGeneration"),
("blip-2", "Blip2ForConditionalGeneration"),
("chameleon", "ChameleonForConditionalGeneration"),
("fuyu", "FuyuForCausalLM"),
("git", "GitForCausalLM"),
("idefics", "IdeficsForVisionText2Text"),
("idefics2", "Idefics2ForConditionalGeneration"),
("idefics3", "Idefics3ForConditionalGeneration"),
("instructblip", "InstructBlipForConditionalGeneration"),
("kosmos-2", "Kosmos2ForConditionalGeneration"),
("llava", "LlavaForConditionalGeneration"),
("llava_next", "LlavaNextForConditionalGeneration"),
("llava_onevision", "LlavaOnevisionForConditionalGeneration"),
("mllama", "MllamaForConditionalGeneration"),
("paligemma", "PaliGemmaForConditionalGeneration"),
("pix2struct", "Pix2StructForConditionalGeneration"),
("pixtral", "LlavaForConditionalGeneration"),
("qwen2_vl", "Qwen2VLForConditionalGeneration"),
("udop", "UdopForConditionalGeneration"),
("vipllava", "VipLlavaForConditionalGeneration"),
("vision-encoder-decoder", "VisionEncoderDecoderModel"),
]
)
MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
[
# Model for Masked LM mapping
@@ -1419,6 +1445,9 @@ MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
)
MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
)
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
)
@@ -1713,6 +1742,13 @@ class AutoModelForVision2Seq(_BaseAutoModelClass):
AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling")
class AutoModelForImageTextToText(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling")
class AutoModelForAudioClassification(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING

View File

@@ -99,6 +99,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("trocr", "TrOCRProcessor"),
("tvlt", "TvltProcessor"),
("tvp", "TvpProcessor"),
("udop", "UdopProcessor"),
("unispeech", "Wav2Vec2Processor"),
("unispeech-sat", "Wav2Vec2Processor"),
("video_llava", "VideoLlavaProcessor"),

View File

@@ -707,6 +707,9 @@ MODEL_FOR_IMAGE_MAPPING = None
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = None
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = None
@@ -874,6 +877,13 @@ class AutoModelForImageSegmentation(metaclass=DummyObject):
requires_backends(self, ["torch"])
class AutoModelForImageTextToText(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class AutoModelForImageToImage(metaclass=DummyObject):
_backends = ["torch"]

View File

@@ -23,7 +23,7 @@ import unittest
import numpy as np
import requests
from transformers import AutoModelForVision2Seq, AutoProcessor, Kosmos2Config
from transformers import AutoModelForImageTextToText, AutoProcessor, Kosmos2Config
from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig
from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available, is_vision_available
@@ -551,7 +551,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
image.save("new_image.jpg")
image = Image.open("new_image.jpg")
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
prompt = "<grounding>An image of"
@@ -697,7 +697,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
image.save("new_image.jpg")
image = Image.open("new_image.jpg")
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
prompt = ["<grounding>Describe this image in detail:", "<grounding>An image of"]

View File

@@ -170,7 +170,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"ClapTextModelWithProjection",
"ClapAudioModel",
"ClapAudioModelWithProjection",
"Blip2ForConditionalGeneration",
"Blip2TextModelWithProjection",
"Blip2VisionModelWithProjection",
"Blip2QFormerModel",
@@ -181,7 +180,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"GitVisionModel",
"GraphormerModel",
"GraphormerForGraphClassification",
"BlipForConditionalGeneration",
"BlipForImageTextRetrieval",
"BlipForQuestionAnswering",
"BlipVisionModel",
@@ -245,7 +243,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"DetrForSegmentation",
"Pix2StructVisionModel",
"Pix2StructTextModel",
"Pix2StructForConditionalGeneration",
"ConditionalDetrForSegmentation",
"DPRReader",
"FlaubertForQuestionAnswering",
@@ -322,7 +319,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"SeamlessM4TCodeHifiGan",
"SeamlessM4TForSpeechToSpeech", # no auto class for speech-to-speech
"TvpForVideoGrounding",
"UdopForConditionalGeneration",
"SeamlessM4Tv2NARTextToUnitModel",
"SeamlessM4Tv2NARTextToUnitForConditionalGeneration",
"SeamlessM4Tv2CodeHifiGan",