Add auto model for image-text-to-text (#32472)
* Add Auto model for image-text-to-text * Remove donut from processing auto, add chameleon to image text to text models * add qwen2_vl and llava_onevision * add pixtral to auto model for image-text-to-text * add mllama and idefics3 * remove models in IGNORE_NON_AUTO_CONFIGURED * add AutoModelForImageTextToText to tests and doc
This commit is contained in:
@@ -381,3 +381,7 @@ The following auto classes are available for the following multimodal tasks.
|
||||
### FlaxAutoModelForVision2Seq
|
||||
|
||||
[[autodoc]] FlaxAutoModelForVision2Seq
|
||||
|
||||
### AutoModelForImageTextToText
|
||||
|
||||
[[autodoc]] AutoModelForImageTextToText
|
||||
|
||||
@@ -166,10 +166,10 @@ LLaVa-Next can perform inference with multiple images as input, where images eit
|
||||
import requests
|
||||
from PIL import Image
|
||||
import torch
|
||||
from transformers import AutoProcessor, LlavaNextForConditionalGeneration
|
||||
from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
|
||||
# Load the model in half-precision
|
||||
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
|
||||
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
|
||||
processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
|
||||
|
||||
# Get three different images
|
||||
@@ -246,7 +246,7 @@ We value your feedback to help identify bugs before the full release! Check out
|
||||
Simply replace the snippet above with:
|
||||
|
||||
```python
|
||||
from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig
|
||||
from transformers import AutoModelForImageTextToText, BitsAndBytesConfig
|
||||
|
||||
# specify how to quantize the model
|
||||
quantization_config = BitsAndBytesConfig(
|
||||
@@ -255,7 +255,7 @@ quantization_config = BitsAndBytesConfig(
|
||||
bnb_4bit_compute_dtype=torch.float16,
|
||||
)
|
||||
|
||||
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
|
||||
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
|
||||
```
|
||||
|
||||
### Use Flash-Attention 2 to further speed-up generation
|
||||
@@ -263,9 +263,9 @@ model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-m
|
||||
First, make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) for installation instructions. Then simply replace the snippet above with:
|
||||
|
||||
```python
|
||||
from transformers import LlavaNextForConditionalGeneration
|
||||
from transformers import AutoModelForImageTextToText
|
||||
|
||||
model = LlavaNextForConditionalGeneration.from_pretrained(
|
||||
model = AutoModelForImageTextToText.from_pretrained(
|
||||
model_id,
|
||||
torch_dtype=torch.float16,
|
||||
low_cpu_mem_usage=True,
|
||||
|
||||
@@ -38,11 +38,11 @@ pip install -q transformers accelerate flash_attn
|
||||
Let's initialize the model and the processor.
|
||||
|
||||
```python
|
||||
from transformers import AutoProcessor, Idefics2ForConditionalGeneration
|
||||
from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
import torch
|
||||
|
||||
device = torch.device("cuda")
|
||||
model = Idefics2ForConditionalGeneration.from_pretrained(
|
||||
model = AutoModelForImageTextToText.from_pretrained(
|
||||
"HuggingFaceM4/idefics2-8b",
|
||||
torch_dtype=torch.bfloat16,
|
||||
attn_implementation="flash_attention_2",
|
||||
@@ -215,11 +215,13 @@ pip install -U quanto bitsandbytes
|
||||
To quantize a model during loading, we need to first create [`QuantoConfig`]. Then load the model as usual, but pass `quantization_config` during model initialization.
|
||||
|
||||
```python
|
||||
from transformers import Idefics2ForConditionalGeneration, AutoTokenizer, QuantoConfig
|
||||
from transformers import AutoModelForImageTextToText, QuantoConfig
|
||||
|
||||
model_id = "HuggingFaceM4/idefics2-8b"
|
||||
quantization_config = QuantoConfig(weights="int8")
|
||||
quantized_model = Idefics2ForConditionalGeneration.from_pretrained(model_id, device_map="cuda", quantization_config=quantization_config)
|
||||
quantized_model = AutoModelForImageTextToText.from_pretrained(
|
||||
model_id, device_map="cuda", quantization_config=quantization_config
|
||||
)
|
||||
```
|
||||
|
||||
And that's it, we can use the model the same way with no changes.
|
||||
|
||||
@@ -368,3 +368,7 @@ AutoModel.register(NewModelConfig, NewModel)
|
||||
### FlaxAutoModelForVision2Seq
|
||||
|
||||
[[autodoc]] FlaxAutoModelForVision2Seq
|
||||
|
||||
### AutoModelForImageTextToText
|
||||
|
||||
[[autodoc]] AutoModelForImageTextToText
|
||||
|
||||
@@ -1407,6 +1407,7 @@ else:
|
||||
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
|
||||
"MODEL_FOR_IMAGE_MAPPING",
|
||||
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
|
||||
"MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
|
||||
"MODEL_FOR_IMAGE_TO_IMAGE_MAPPING",
|
||||
"MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
|
||||
"MODEL_FOR_KEYPOINT_DETECTION_MAPPING",
|
||||
@@ -1448,6 +1449,7 @@ else:
|
||||
"AutoModelForDocumentQuestionAnswering",
|
||||
"AutoModelForImageClassification",
|
||||
"AutoModelForImageSegmentation",
|
||||
"AutoModelForImageTextToText",
|
||||
"AutoModelForImageToImage",
|
||||
"AutoModelForInstanceSegmentation",
|
||||
"AutoModelForKeypointDetection",
|
||||
@@ -6272,6 +6274,7 @@ if TYPE_CHECKING:
|
||||
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
|
||||
MODEL_FOR_IMAGE_MAPPING,
|
||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
||||
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
|
||||
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
|
||||
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
|
||||
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
|
||||
@@ -6313,6 +6316,7 @@ if TYPE_CHECKING:
|
||||
AutoModelForDocumentQuestionAnswering,
|
||||
AutoModelForImageClassification,
|
||||
AutoModelForImageSegmentation,
|
||||
AutoModelForImageTextToText,
|
||||
AutoModelForImageToImage,
|
||||
AutoModelForInstanceSegmentation,
|
||||
AutoModelForKeypointDetection,
|
||||
|
||||
@@ -74,6 +74,7 @@ else:
|
||||
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
|
||||
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
|
||||
"MODEL_FOR_VISION_2_SEQ_MAPPING",
|
||||
"MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
|
||||
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
|
||||
"MODEL_MAPPING",
|
||||
"MODEL_WITH_LM_HEAD_MAPPING",
|
||||
@@ -119,6 +120,7 @@ else:
|
||||
"AutoModelWithLMHead",
|
||||
"AutoModelForZeroShotImageClassification",
|
||||
"AutoModelForZeroShotObjectDetection",
|
||||
"AutoModelForImageTextToText",
|
||||
]
|
||||
|
||||
try:
|
||||
@@ -238,6 +240,7 @@ if TYPE_CHECKING:
|
||||
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
|
||||
MODEL_FOR_IMAGE_MAPPING,
|
||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
||||
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
|
||||
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
|
||||
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
|
||||
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
|
||||
@@ -279,6 +282,7 @@ if TYPE_CHECKING:
|
||||
AutoModelForDocumentQuestionAnswering,
|
||||
AutoModelForImageClassification,
|
||||
AutoModelForImageSegmentation,
|
||||
AutoModelForImageTextToText,
|
||||
AutoModelForImageToImage,
|
||||
AutoModelForInstanceSegmentation,
|
||||
AutoModelForKeypointDetection,
|
||||
|
||||
@@ -757,6 +757,32 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
|
||||
]
|
||||
)
|
||||
|
||||
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
("blip", "BlipForConditionalGeneration"),
|
||||
("blip-2", "Blip2ForConditionalGeneration"),
|
||||
("chameleon", "ChameleonForConditionalGeneration"),
|
||||
("fuyu", "FuyuForCausalLM"),
|
||||
("git", "GitForCausalLM"),
|
||||
("idefics", "IdeficsForVisionText2Text"),
|
||||
("idefics2", "Idefics2ForConditionalGeneration"),
|
||||
("idefics3", "Idefics3ForConditionalGeneration"),
|
||||
("instructblip", "InstructBlipForConditionalGeneration"),
|
||||
("kosmos-2", "Kosmos2ForConditionalGeneration"),
|
||||
("llava", "LlavaForConditionalGeneration"),
|
||||
("llava_next", "LlavaNextForConditionalGeneration"),
|
||||
("llava_onevision", "LlavaOnevisionForConditionalGeneration"),
|
||||
("mllama", "MllamaForConditionalGeneration"),
|
||||
("paligemma", "PaliGemmaForConditionalGeneration"),
|
||||
("pix2struct", "Pix2StructForConditionalGeneration"),
|
||||
("pixtral", "LlavaForConditionalGeneration"),
|
||||
("qwen2_vl", "Qwen2VLForConditionalGeneration"),
|
||||
("udop", "UdopForConditionalGeneration"),
|
||||
("vipllava", "VipLlavaForConditionalGeneration"),
|
||||
("vision-encoder-decoder", "VisionEncoderDecoderModel"),
|
||||
]
|
||||
)
|
||||
|
||||
MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
# Model for Masked LM mapping
|
||||
@@ -1419,6 +1445,9 @@ MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
|
||||
CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
|
||||
)
|
||||
MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
|
||||
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping(
|
||||
CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
|
||||
)
|
||||
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
|
||||
CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
|
||||
)
|
||||
@@ -1713,6 +1742,13 @@ class AutoModelForVision2Seq(_BaseAutoModelClass):
|
||||
AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling")
|
||||
|
||||
|
||||
class AutoModelForImageTextToText(_BaseAutoModelClass):
|
||||
_model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
|
||||
|
||||
|
||||
AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling")
|
||||
|
||||
|
||||
class AutoModelForAudioClassification(_BaseAutoModelClass):
|
||||
_model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
|
||||
|
||||
|
||||
@@ -99,6 +99,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
|
||||
("trocr", "TrOCRProcessor"),
|
||||
("tvlt", "TvltProcessor"),
|
||||
("tvp", "TvpProcessor"),
|
||||
("udop", "UdopProcessor"),
|
||||
("unispeech", "Wav2Vec2Processor"),
|
||||
("unispeech-sat", "Wav2Vec2Processor"),
|
||||
("video_llava", "VideoLlavaProcessor"),
|
||||
|
||||
@@ -707,6 +707,9 @@ MODEL_FOR_IMAGE_MAPPING = None
|
||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None
|
||||
|
||||
|
||||
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = None
|
||||
|
||||
|
||||
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = None
|
||||
|
||||
|
||||
@@ -874,6 +877,13 @@ class AutoModelForImageSegmentation(metaclass=DummyObject):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class AutoModelForImageTextToText(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class AutoModelForImageToImage(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ import unittest
|
||||
import numpy as np
|
||||
import requests
|
||||
|
||||
from transformers import AutoModelForVision2Seq, AutoProcessor, Kosmos2Config
|
||||
from transformers import AutoModelForImageTextToText, AutoProcessor, Kosmos2Config
|
||||
from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig
|
||||
from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
@@ -551,7 +551,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
|
||||
image.save("new_image.jpg")
|
||||
image = Image.open("new_image.jpg")
|
||||
|
||||
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
|
||||
model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
|
||||
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
|
||||
|
||||
prompt = "<grounding>An image of"
|
||||
@@ -697,7 +697,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
|
||||
image.save("new_image.jpg")
|
||||
image = Image.open("new_image.jpg")
|
||||
|
||||
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
|
||||
model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
|
||||
|
||||
prompt = ["<grounding>Describe this image in detail:", "<grounding>An image of"]
|
||||
|
||||
|
||||
@@ -170,7 +170,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
|
||||
"ClapTextModelWithProjection",
|
||||
"ClapAudioModel",
|
||||
"ClapAudioModelWithProjection",
|
||||
"Blip2ForConditionalGeneration",
|
||||
"Blip2TextModelWithProjection",
|
||||
"Blip2VisionModelWithProjection",
|
||||
"Blip2QFormerModel",
|
||||
@@ -181,7 +180,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
|
||||
"GitVisionModel",
|
||||
"GraphormerModel",
|
||||
"GraphormerForGraphClassification",
|
||||
"BlipForConditionalGeneration",
|
||||
"BlipForImageTextRetrieval",
|
||||
"BlipForQuestionAnswering",
|
||||
"BlipVisionModel",
|
||||
@@ -245,7 +243,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
|
||||
"DetrForSegmentation",
|
||||
"Pix2StructVisionModel",
|
||||
"Pix2StructTextModel",
|
||||
"Pix2StructForConditionalGeneration",
|
||||
"ConditionalDetrForSegmentation",
|
||||
"DPRReader",
|
||||
"FlaubertForQuestionAnswering",
|
||||
@@ -322,7 +319,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
|
||||
"SeamlessM4TCodeHifiGan",
|
||||
"SeamlessM4TForSpeechToSpeech", # no auto class for speech-to-speech
|
||||
"TvpForVideoGrounding",
|
||||
"UdopForConditionalGeneration",
|
||||
"SeamlessM4Tv2NARTextToUnitModel",
|
||||
"SeamlessM4Tv2NARTextToUnitForConditionalGeneration",
|
||||
"SeamlessM4Tv2CodeHifiGan",
|
||||
|
||||
Reference in New Issue
Block a user