Add auto model for image-text-to-text (#32472)
* Add Auto model for image-text-to-text * Remove donut from processing auto, add chameleon to image text to text models * add qwen2_vl and llava_onevision * add pixtral to auto model for image-text-to-text * add mllama and idefics3 * remove models in IGNORE_NON_AUTO_CONFIGURED * add AutoModelForImageTextToText to tests and doc
This commit is contained in:
@@ -381,3 +381,7 @@ The following auto classes are available for the following multimodal tasks.
|
|||||||
### FlaxAutoModelForVision2Seq
|
### FlaxAutoModelForVision2Seq
|
||||||
|
|
||||||
[[autodoc]] FlaxAutoModelForVision2Seq
|
[[autodoc]] FlaxAutoModelForVision2Seq
|
||||||
|
|
||||||
|
### AutoModelForImageTextToText
|
||||||
|
|
||||||
|
[[autodoc]] AutoModelForImageTextToText
|
||||||
|
|||||||
@@ -166,10 +166,10 @@ LLaVa-Next can perform inference with multiple images as input, where images eit
|
|||||||
import requests
|
import requests
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import torch
|
import torch
|
||||||
from transformers import AutoProcessor, LlavaNextForConditionalGeneration
|
from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||||
|
|
||||||
# Load the model in half-precision
|
# Load the model in half-precision
|
||||||
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
|
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
|
||||||
processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
|
processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
|
||||||
|
|
||||||
# Get three different images
|
# Get three different images
|
||||||
@@ -246,7 +246,7 @@ We value your feedback to help identify bugs before the full release! Check out
|
|||||||
Simply change the snippet above with:
|
Simply change the snippet above with:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig
|
from transformers import AutoModelForImageTextToText, BitsAndBytesConfig
|
||||||
|
|
||||||
# specify how to quantize the model
|
# specify how to quantize the model
|
||||||
quantization_config = BitsAndBytesConfig(
|
quantization_config = BitsAndBytesConfig(
|
||||||
@@ -255,7 +255,7 @@ quantization_config = BitsAndBytesConfig(
|
|||||||
bnb_4bit_compute_dtype=torch.float16,
|
bnb_4bit_compute_dtype=torch.float16,
|
||||||
)
|
)
|
||||||
|
|
||||||
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
|
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
|
||||||
```
|
```
|
||||||
|
|
||||||
### Use Flash-Attention 2 to further speed-up generation
|
### Use Flash-Attention 2 to further speed-up generation
|
||||||
@@ -263,9 +263,9 @@ model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-m
|
|||||||
First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) regarding that package installation. Simply change the snippet above with:
|
First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) regarding that package installation. Simply change the snippet above with:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from transformers import LlavaNextForConditionalGeneration
|
from transformers import AutoModelForImageTextToText
|
||||||
|
|
||||||
model = LlavaNextForConditionalGeneration.from_pretrained(
|
model = AutoModelForImageTextToText.from_pretrained(
|
||||||
model_id,
|
model_id,
|
||||||
torch_dtype=torch.float16,
|
torch_dtype=torch.float16,
|
||||||
low_cpu_mem_usage=True,
|
low_cpu_mem_usage=True,
|
||||||
|
|||||||
@@ -27,22 +27,22 @@ To begin with, there are multiple types of VLMs:
|
|||||||
- chat fine-tuned models for conversation
|
- chat fine-tuned models for conversation
|
||||||
- instruction fine-tuned models
|
- instruction fine-tuned models
|
||||||
|
|
||||||
This guide focuses on inference with an instruction-tuned model.
|
This guide focuses on inference with an instruction-tuned model.
|
||||||
|
|
||||||
Let's begin by installing the dependencies.
|
Let's begin by installing the dependencies.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install -q transformers accelerate flash_attn
|
pip install -q transformers accelerate flash_attn
|
||||||
```
|
```
|
||||||
|
|
||||||
Let's initialize the model and the processor.
|
Let's initialize the model and the processor.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from transformers import AutoProcessor, Idefics2ForConditionalGeneration
|
from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
device = torch.device("cuda")
|
device = torch.device("cuda")
|
||||||
model = Idefics2ForConditionalGeneration.from_pretrained(
|
model = AutoModelForImageTextToText.from_pretrained(
|
||||||
"HuggingFaceM4/idefics2-8b",
|
"HuggingFaceM4/idefics2-8b",
|
||||||
torch_dtype=torch.bfloat16,
|
torch_dtype=torch.bfloat16,
|
||||||
attn_implementation="flash_attention_2",
|
attn_implementation="flash_attention_2",
|
||||||
@@ -51,7 +51,7 @@ model = Idefics2ForConditionalGeneration.from_pretrained(
|
|||||||
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
|
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
|
||||||
```
|
```
|
||||||
|
|
||||||
This model has a [chat template](./chat_templating) that helps users parse chat outputs. Moreover, the model can also accept multiple images as input in a single conversation or message. We will now prepare the inputs.
|
This model has a [chat template](./chat_templating) that helps users parse chat outputs. Moreover, the model can also accept multiple images as input in a single conversation or message. We will now prepare the inputs.
|
||||||
|
|
||||||
The image inputs look like the following.
|
The image inputs look like the following.
|
||||||
|
|
||||||
@@ -74,7 +74,7 @@ images = [Image.open(requests.get(img_urls[0], stream=True).raw),
|
|||||||
Image.open(requests.get(img_urls[1], stream=True).raw)]
|
Image.open(requests.get(img_urls[1], stream=True).raw)]
|
||||||
```
|
```
|
||||||
|
|
||||||
Below is an example of the chat template. We can feed conversation turns and the last message as an input by appending it at the end of the template.
|
Below is an example of the chat template. We can feed conversation turns and the last message as an input by appending it at the end of the template.
|
||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@@ -98,7 +98,7 @@ messages = [
|
|||||||
{"type": "image"},
|
{"type": "image"},
|
||||||
{"type": "text", "text": "And how about this image?"},
|
{"type": "text", "text": "And how about this image?"},
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -180,11 +180,11 @@ def model_inference(
|
|||||||
if acc_text.endswith("<end_of_utterance>"):
|
if acc_text.endswith("<end_of_utterance>"):
|
||||||
acc_text = acc_text[:-18]
|
acc_text = acc_text[:-18]
|
||||||
yield acc_text
|
yield acc_text
|
||||||
|
|
||||||
thread.join()
|
thread.join()
|
||||||
```
|
```
|
||||||
|
|
||||||
Now let's call the `model_inference` function we created and stream the values.
|
Now let's call the `model_inference` function we created and stream the values.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
generator = model_inference(
|
generator = model_inference(
|
||||||
@@ -204,7 +204,7 @@ for value in generator:
|
|||||||
|
|
||||||
## Fit models in smaller hardware
|
## Fit models in smaller hardware
|
||||||
|
|
||||||
VLMs are often large and need to be optimized to fit on smaller hardware. Transformers supports many model quantization libraries, and here we will only show int8 quantization with [Quanto](./quantization/quanto#quanto). int8 quantization offers memory improvements of up to 75 percent (if all weights are quantized). However, it is no free lunch: since 8-bit is not a CUDA-native precision, the weights are quantized back and forth on the fly, which adds latency.
|
VLMs are often large and need to be optimized to fit on smaller hardware. Transformers supports many model quantization libraries, and here we will only show int8 quantization with [Quanto](./quantization/quanto#quanto). int8 quantization offers memory improvements of up to 75 percent (if all weights are quantized). However, it is no free lunch: since 8-bit is not a CUDA-native precision, the weights are quantized back and forth on the fly, which adds latency.
|
||||||
|
|
||||||
First, install dependencies.
|
First, install dependencies.
|
||||||
|
|
||||||
@@ -215,18 +215,20 @@ pip install -U quanto bitsandbytes
|
|||||||
To quantize a model during loading, we need to first create [`QuantoConfig`]. Then load the model as usual, but pass `quantization_config` during model initialization.
|
To quantize a model during loading, we need to first create [`QuantoConfig`]. Then load the model as usual, but pass `quantization_config` during model initialization.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from transformers import Idefics2ForConditionalGeneration, AutoTokenizer, QuantoConfig
|
from transformers import AutoModelForImageTextToText, QuantoConfig
|
||||||
|
|
||||||
model_id = "HuggingFaceM4/idefics2-8b"
|
model_id = "HuggingFaceM4/idefics2-8b"
|
||||||
quantization_config = QuantoConfig(weights="int8")
|
quantization_config = QuantoConfig(weights="int8")
|
||||||
quantized_model = Idefics2ForConditionalGeneration.from_pretrained(model_id, device_map="cuda", quantization_config=quantization_config)
|
quantized_model = AutoModelForImageTextToText.from_pretrained(
|
||||||
|
model_id, device_map="cuda", quantization_config=quantization_config
|
||||||
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
And that's it, we can use the model the same way with no changes.
|
And that's it, we can use the model the same way with no changes.
|
||||||
|
|
||||||
## Further Reading
|
## Further Reading
|
||||||
|
|
||||||
Here are some more resources for the image-text-to-text task.
|
Here are some more resources for the image-text-to-text task.
|
||||||
|
|
||||||
- [Image-text-to-text task page](https://huggingface.co/tasks/image-text-to-text) covers model types, use cases, datasets, and more.
|
- [Image-text-to-text task page](https://huggingface.co/tasks/image-text-to-text) covers model types, use cases, datasets, and more.
|
||||||
- [Vision Language Models Explained](https://huggingface.co/blog/vlms) is a blog post that covers everything about vision language models and supervised fine-tuning using [TRL](https://huggingface.co/docs/trl/en/index).
|
- [Vision Language Models Explained](https://huggingface.co/blog/vlms) is a blog post that covers everything about vision language models and supervised fine-tuning using [TRL](https://huggingface.co/docs/trl/en/index).
|
||||||
|
|||||||
@@ -368,3 +368,7 @@ AutoModel.register(NewModelConfig, NewModel)
|
|||||||
### FlaxAutoModelForVision2Seq
|
### FlaxAutoModelForVision2Seq
|
||||||
|
|
||||||
[[autodoc]] FlaxAutoModelForVision2Seq
|
[[autodoc]] FlaxAutoModelForVision2Seq
|
||||||
|
|
||||||
|
### AutoModelForImageTextToText
|
||||||
|
|
||||||
|
[[autodoc]] AutoModelForImageTextToText
|
||||||
|
|||||||
@@ -1407,6 +1407,7 @@ else:
|
|||||||
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
|
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
|
||||||
"MODEL_FOR_IMAGE_MAPPING",
|
"MODEL_FOR_IMAGE_MAPPING",
|
||||||
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
|
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
|
||||||
|
"MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
|
||||||
"MODEL_FOR_IMAGE_TO_IMAGE_MAPPING",
|
"MODEL_FOR_IMAGE_TO_IMAGE_MAPPING",
|
||||||
"MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
|
"MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
|
||||||
"MODEL_FOR_KEYPOINT_DETECTION_MAPPING",
|
"MODEL_FOR_KEYPOINT_DETECTION_MAPPING",
|
||||||
@@ -1448,6 +1449,7 @@ else:
|
|||||||
"AutoModelForDocumentQuestionAnswering",
|
"AutoModelForDocumentQuestionAnswering",
|
||||||
"AutoModelForImageClassification",
|
"AutoModelForImageClassification",
|
||||||
"AutoModelForImageSegmentation",
|
"AutoModelForImageSegmentation",
|
||||||
|
"AutoModelForImageTextToText",
|
||||||
"AutoModelForImageToImage",
|
"AutoModelForImageToImage",
|
||||||
"AutoModelForInstanceSegmentation",
|
"AutoModelForInstanceSegmentation",
|
||||||
"AutoModelForKeypointDetection",
|
"AutoModelForKeypointDetection",
|
||||||
@@ -6272,6 +6274,7 @@ if TYPE_CHECKING:
|
|||||||
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
|
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
|
||||||
MODEL_FOR_IMAGE_MAPPING,
|
MODEL_FOR_IMAGE_MAPPING,
|
||||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
||||||
|
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
|
||||||
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
|
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
|
||||||
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
|
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
|
||||||
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
|
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
|
||||||
@@ -6313,6 +6316,7 @@ if TYPE_CHECKING:
|
|||||||
AutoModelForDocumentQuestionAnswering,
|
AutoModelForDocumentQuestionAnswering,
|
||||||
AutoModelForImageClassification,
|
AutoModelForImageClassification,
|
||||||
AutoModelForImageSegmentation,
|
AutoModelForImageSegmentation,
|
||||||
|
AutoModelForImageTextToText,
|
||||||
AutoModelForImageToImage,
|
AutoModelForImageToImage,
|
||||||
AutoModelForInstanceSegmentation,
|
AutoModelForInstanceSegmentation,
|
||||||
AutoModelForKeypointDetection,
|
AutoModelForKeypointDetection,
|
||||||
|
|||||||
@@ -74,6 +74,7 @@ else:
|
|||||||
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
|
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
|
||||||
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
|
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
|
||||||
"MODEL_FOR_VISION_2_SEQ_MAPPING",
|
"MODEL_FOR_VISION_2_SEQ_MAPPING",
|
||||||
|
"MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
|
||||||
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
|
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
|
||||||
"MODEL_MAPPING",
|
"MODEL_MAPPING",
|
||||||
"MODEL_WITH_LM_HEAD_MAPPING",
|
"MODEL_WITH_LM_HEAD_MAPPING",
|
||||||
@@ -119,6 +120,7 @@ else:
|
|||||||
"AutoModelWithLMHead",
|
"AutoModelWithLMHead",
|
||||||
"AutoModelForZeroShotImageClassification",
|
"AutoModelForZeroShotImageClassification",
|
||||||
"AutoModelForZeroShotObjectDetection",
|
"AutoModelForZeroShotObjectDetection",
|
||||||
|
"AutoModelForImageTextToText",
|
||||||
]
|
]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -238,6 +240,7 @@ if TYPE_CHECKING:
|
|||||||
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
|
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
|
||||||
MODEL_FOR_IMAGE_MAPPING,
|
MODEL_FOR_IMAGE_MAPPING,
|
||||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
||||||
|
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
|
||||||
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
|
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
|
||||||
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
|
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
|
||||||
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
|
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
|
||||||
@@ -279,6 +282,7 @@ if TYPE_CHECKING:
|
|||||||
AutoModelForDocumentQuestionAnswering,
|
AutoModelForDocumentQuestionAnswering,
|
||||||
AutoModelForImageClassification,
|
AutoModelForImageClassification,
|
||||||
AutoModelForImageSegmentation,
|
AutoModelForImageSegmentation,
|
||||||
|
AutoModelForImageTextToText,
|
||||||
AutoModelForImageToImage,
|
AutoModelForImageToImage,
|
||||||
AutoModelForInstanceSegmentation,
|
AutoModelForInstanceSegmentation,
|
||||||
AutoModelForKeypointDetection,
|
AutoModelForKeypointDetection,
|
||||||
|
|||||||
@@ -757,6 +757,32 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
|
||||||
|
[
|
||||||
|
("blip", "BlipForConditionalGeneration"),
|
||||||
|
("blip-2", "Blip2ForConditionalGeneration"),
|
||||||
|
("chameleon", "ChameleonForConditionalGeneration"),
|
||||||
|
("fuyu", "FuyuForCausalLM"),
|
||||||
|
("git", "GitForCausalLM"),
|
||||||
|
("idefics", "IdeficsForVisionText2Text"),
|
||||||
|
("idefics2", "Idefics2ForConditionalGeneration"),
|
||||||
|
("idefics3", "Idefics3ForConditionalGeneration"),
|
||||||
|
("instructblip", "InstructBlipForConditionalGeneration"),
|
||||||
|
("kosmos-2", "Kosmos2ForConditionalGeneration"),
|
||||||
|
("llava", "LlavaForConditionalGeneration"),
|
||||||
|
("llava_next", "LlavaNextForConditionalGeneration"),
|
||||||
|
("llava_onevision", "LlavaOnevisionForConditionalGeneration"),
|
||||||
|
("mllama", "MllamaForConditionalGeneration"),
|
||||||
|
("paligemma", "PaliGemmaForConditionalGeneration"),
|
||||||
|
("pix2struct", "Pix2StructForConditionalGeneration"),
|
||||||
|
("pixtral", "LlavaForConditionalGeneration"),
|
||||||
|
("qwen2_vl", "Qwen2VLForConditionalGeneration"),
|
||||||
|
("udop", "UdopForConditionalGeneration"),
|
||||||
|
("vipllava", "VipLlavaForConditionalGeneration"),
|
||||||
|
("vision-encoder-decoder", "VisionEncoderDecoderModel"),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
|
MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
|
||||||
[
|
[
|
||||||
# Model for Masked LM mapping
|
# Model for Masked LM mapping
|
||||||
@@ -1419,6 +1445,9 @@ MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
|
|||||||
CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
|
CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
|
||||||
)
|
)
|
||||||
MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
|
MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
|
||||||
|
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping(
|
||||||
|
CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
|
||||||
|
)
|
||||||
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
|
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
|
||||||
CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
|
CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
|
||||||
)
|
)
|
||||||
@@ -1713,6 +1742,13 @@ class AutoModelForVision2Seq(_BaseAutoModelClass):
|
|||||||
AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling")
|
AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling")
|
||||||
|
|
||||||
|
|
||||||
|
class AutoModelForImageTextToText(_BaseAutoModelClass):
|
||||||
|
_model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
|
||||||
|
|
||||||
|
|
||||||
|
AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling")
|
||||||
|
|
||||||
|
|
||||||
class AutoModelForAudioClassification(_BaseAutoModelClass):
|
class AutoModelForAudioClassification(_BaseAutoModelClass):
|
||||||
_model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
|
_model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
|
||||||
|
|
||||||
|
|||||||
@@ -99,6 +99,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
|
|||||||
("trocr", "TrOCRProcessor"),
|
("trocr", "TrOCRProcessor"),
|
||||||
("tvlt", "TvltProcessor"),
|
("tvlt", "TvltProcessor"),
|
||||||
("tvp", "TvpProcessor"),
|
("tvp", "TvpProcessor"),
|
||||||
|
("udop", "UdopProcessor"),
|
||||||
("unispeech", "Wav2Vec2Processor"),
|
("unispeech", "Wav2Vec2Processor"),
|
||||||
("unispeech-sat", "Wav2Vec2Processor"),
|
("unispeech-sat", "Wav2Vec2Processor"),
|
||||||
("video_llava", "VideoLlavaProcessor"),
|
("video_llava", "VideoLlavaProcessor"),
|
||||||
|
|||||||
@@ -707,6 +707,9 @@ MODEL_FOR_IMAGE_MAPPING = None
|
|||||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None
|
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None
|
||||||
|
|
||||||
|
|
||||||
|
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = None
|
||||||
|
|
||||||
|
|
||||||
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = None
|
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = None
|
||||||
|
|
||||||
|
|
||||||
@@ -874,6 +877,13 @@ class AutoModelForImageSegmentation(metaclass=DummyObject):
|
|||||||
requires_backends(self, ["torch"])
|
requires_backends(self, ["torch"])
|
||||||
|
|
||||||
|
|
||||||
|
class AutoModelForImageTextToText(metaclass=DummyObject):
|
||||||
|
_backends = ["torch"]
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
requires_backends(self, ["torch"])
|
||||||
|
|
||||||
|
|
||||||
class AutoModelForImageToImage(metaclass=DummyObject):
|
class AutoModelForImageToImage(metaclass=DummyObject):
|
||||||
_backends = ["torch"]
|
_backends = ["torch"]
|
||||||
|
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ import unittest
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from transformers import AutoModelForVision2Seq, AutoProcessor, Kosmos2Config
|
from transformers import AutoModelForImageTextToText, AutoProcessor, Kosmos2Config
|
||||||
from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig
|
from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig
|
||||||
from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device
|
from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device
|
||||||
from transformers.utils import is_torch_available, is_vision_available
|
from transformers.utils import is_torch_available, is_vision_available
|
||||||
@@ -551,7 +551,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
|
|||||||
image.save("new_image.jpg")
|
image.save("new_image.jpg")
|
||||||
image = Image.open("new_image.jpg")
|
image = Image.open("new_image.jpg")
|
||||||
|
|
||||||
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
|
model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
|
||||||
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
|
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
|
||||||
|
|
||||||
prompt = "<grounding>An image of"
|
prompt = "<grounding>An image of"
|
||||||
@@ -697,7 +697,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
|
|||||||
image.save("new_image.jpg")
|
image.save("new_image.jpg")
|
||||||
image = Image.open("new_image.jpg")
|
image = Image.open("new_image.jpg")
|
||||||
|
|
||||||
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
|
model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
|
||||||
|
|
||||||
prompt = ["<grounding>Describe this image in detail:", "<grounding>An image of"]
|
prompt = ["<grounding>Describe this image in detail:", "<grounding>An image of"]
|
||||||
|
|
||||||
|
|||||||
@@ -170,7 +170,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
|
|||||||
"ClapTextModelWithProjection",
|
"ClapTextModelWithProjection",
|
||||||
"ClapAudioModel",
|
"ClapAudioModel",
|
||||||
"ClapAudioModelWithProjection",
|
"ClapAudioModelWithProjection",
|
||||||
"Blip2ForConditionalGeneration",
|
|
||||||
"Blip2TextModelWithProjection",
|
"Blip2TextModelWithProjection",
|
||||||
"Blip2VisionModelWithProjection",
|
"Blip2VisionModelWithProjection",
|
||||||
"Blip2QFormerModel",
|
"Blip2QFormerModel",
|
||||||
@@ -181,7 +180,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
|
|||||||
"GitVisionModel",
|
"GitVisionModel",
|
||||||
"GraphormerModel",
|
"GraphormerModel",
|
||||||
"GraphormerForGraphClassification",
|
"GraphormerForGraphClassification",
|
||||||
"BlipForConditionalGeneration",
|
|
||||||
"BlipForImageTextRetrieval",
|
"BlipForImageTextRetrieval",
|
||||||
"BlipForQuestionAnswering",
|
"BlipForQuestionAnswering",
|
||||||
"BlipVisionModel",
|
"BlipVisionModel",
|
||||||
@@ -245,7 +243,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
|
|||||||
"DetrForSegmentation",
|
"DetrForSegmentation",
|
||||||
"Pix2StructVisionModel",
|
"Pix2StructVisionModel",
|
||||||
"Pix2StructTextModel",
|
"Pix2StructTextModel",
|
||||||
"Pix2StructForConditionalGeneration",
|
|
||||||
"ConditionalDetrForSegmentation",
|
"ConditionalDetrForSegmentation",
|
||||||
"DPRReader",
|
"DPRReader",
|
||||||
"FlaubertForQuestionAnswering",
|
"FlaubertForQuestionAnswering",
|
||||||
@@ -322,7 +319,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
|
|||||||
"SeamlessM4TCodeHifiGan",
|
"SeamlessM4TCodeHifiGan",
|
||||||
"SeamlessM4TForSpeechToSpeech", # no auto class for speech-to-speech
|
"SeamlessM4TForSpeechToSpeech", # no auto class for speech-to-speech
|
||||||
"TvpForVideoGrounding",
|
"TvpForVideoGrounding",
|
||||||
"UdopForConditionalGeneration",
|
|
||||||
"SeamlessM4Tv2NARTextToUnitModel",
|
"SeamlessM4Tv2NARTextToUnitModel",
|
||||||
"SeamlessM4Tv2NARTextToUnitForConditionalGeneration",
|
"SeamlessM4Tv2NARTextToUnitForConditionalGeneration",
|
||||||
"SeamlessM4Tv2CodeHifiGan",
|
"SeamlessM4Tv2CodeHifiGan",
|
||||||
|
|||||||
Reference in New Issue
Block a user