diff --git a/docs/source/en/model_doc/auto.md b/docs/source/en/model_doc/auto.md index ab42c24d83..0593128508 100644 --- a/docs/source/en/model_doc/auto.md +++ b/docs/source/en/model_doc/auto.md @@ -381,3 +381,7 @@ The following auto classes are available for the following multimodal tasks. ### FlaxAutoModelForVision2Seq [[autodoc]] FlaxAutoModelForVision2Seq + +### AutoModelForImageTextToText + +[[autodoc]] AutoModelForImageTextToText diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index 4c041f4e89..b9146fbd33 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -166,10 +166,10 @@ LLaVa-Next can perform inference with multiple images as input, where images eit import requests from PIL import Image import torch -from transformers import AutoProcessor, LlavaNextForConditionalGeneration +from transformers import AutoProcessor, AutoModelForImageTextToText # Load the model in half-precision -model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto") +model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto") processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf") # Get three different images @@ -246,7 +246,7 @@ We value your feedback to help identify bugs before the full release! 
Check out Simply change the snippet above with: ```python -from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig +from transformers import AutoModelForImageTextToText, BitsAndBytesConfig # specify how to quantize the model quantization_config = BitsAndBytesConfig( @@ -255,7 +255,7 @@ quantization_config = BitsAndBytesConfig( bnb_4bit_compute_dtype=torch.float16, ) -model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto") +model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto") ``` ### Use Flash-Attention 2 to further speed-up generation @@ -263,9 +263,9 @@ model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-m First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) regarding that package installation. Simply change the snippet above with: ```python -from transformers import LlavaNextForConditionalGeneration +from transformers import AutoModelForImageTextToText -model = LlavaNextForConditionalGeneration.from_pretrained( +model = AutoModelForImageTextToText.from_pretrained( model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, diff --git a/docs/source/en/tasks/image_text_to_text.md b/docs/source/en/tasks/image_text_to_text.md index 74f6a3408b..261abf9472 100644 --- a/docs/source/en/tasks/image_text_to_text.md +++ b/docs/source/en/tasks/image_text_to_text.md @@ -27,22 +27,22 @@ To begin with, there are multiple types of VLMs: - chat fine-tuned models for conversation - instruction fine-tuned models -This guide focuses on inference with an instruction-tuned model. +This guide focuses on inference with an instruction-tuned model. Let's begin installing the dependencies. 
```bash -pip install -q transformers accelerate flash_attn +pip install -q transformers accelerate flash_attn ``` -Let's initialize the model and the processor. +Let's initialize the model and the processor. ```python -from transformers import AutoProcessor, Idefics2ForConditionalGeneration +from transformers import AutoProcessor, AutoModelForImageTextToText import torch device = torch.device("cuda") -model = Idefics2ForConditionalGeneration.from_pretrained( +model = AutoModelForImageTextToText.from_pretrained( "HuggingFaceM4/idefics2-8b", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", @@ -51,7 +51,7 @@ model = Idefics2ForConditionalGeneration.from_pretrained( processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b") ``` -This model has a [chat template](./chat_templating) that helps user parse chat outputs. Moreover, the model can also accept multiple images as input in a single conversation or message. We will now prepare the inputs. +This model has a [chat template](./chat_templating) that helps user parse chat outputs. Moreover, the model can also accept multiple images as input in a single conversation or message. We will now prepare the inputs. The image inputs look like the following. @@ -74,7 +74,7 @@ images = [Image.open(requests.get(img_urls[0], stream=True).raw), Image.open(requests.get(img_urls[1], stream=True).raw)] ``` -Below is an example of the chat template. We can feed conversation turns and the last message as an input by appending it at the end of the template. +Below is an example of the chat template. We can feed conversation turns and the last message as an input by appending it at the end of the template. 
```python @@ -98,7 +98,7 @@ messages = [ {"type": "image"}, {"type": "text", "text": "And how about this image?"}, ] - }, + }, ] ``` @@ -180,11 +180,11 @@ def model_inference( if acc_text.endswith("<end_of_utterance>"): acc_text = acc_text[:-18] yield acc_text - + thread.join() ``` -Now let's call the `model_inference` function we created and stream the values. +Now let's call the `model_inference` function we created and stream the values. ```python generator = model_inference( @@ -204,7 +204,7 @@ for value in generator: ## Fit models in smaller hardware -VLMs are often large and need to be optimized to fit on smaller hardware. Transformers supports many model quantization libraries, and here we will only show int8 quantization with [Quanto](./quantization/quanto#quanto). int8 quantization offers memory improvements up to 75 percent (if all weights are quantized). However it is no free lunch, since 8-bit is not a CUDA-native precision, the weights are quantized back and forth on the fly, which adds up to latency. +VLMs are often large and need to be optimized to fit on smaller hardware. Transformers supports many model quantization libraries, and here we will only show int8 quantization with [Quanto](./quantization/quanto#quanto). int8 quantization offers memory improvements up to 75 percent (if all weights are quantized). However it is no free lunch, since 8-bit is not a CUDA-native precision, the weights are quantized back and forth on the fly, which adds up to latency. First, install dependencies. @@ -215,18 +215,20 @@ pip install -U quanto bitsandbytes To quantize a model during loading, we need to first create [`QuantoConfig`]. Then load the model as usual, but pass `quantization_config` during model initialization. 
```python -from transformers import Idefics2ForConditionalGeneration, AutoTokenizer, QuantoConfig +from transformers import AutoModelForImageTextToText, QuantoConfig model_id = "HuggingFaceM4/idefics2-8b" quantization_config = QuantoConfig(weights="int8") -quantized_model = Idefics2ForConditionalGeneration.from_pretrained(model_id, device_map="cuda", quantization_config=quantization_config) +quantized_model = AutoModelForImageTextToText.from_pretrained( + model_id, device_map="cuda", quantization_config=quantization_config +) ``` -And that's it, we can use the model the same way with no changes. +And that's it, we can use the model the same way with no changes. ## Further Reading Here are some more resources for the image-text-to-text task. -- [Image-text-to-text task page](https://huggingface.co/tasks/image-text-to-text) covers model types, use cases, datasets, and more. +- [Image-text-to-text task page](https://huggingface.co/tasks/image-text-to-text) covers model types, use cases, datasets, and more. - [Vision Language Models Explained](https://huggingface.co/blog/vlms) is a blog post that covers everything about vision language models and supervised fine-tuning using [TRL](https://huggingface.co/docs/trl/en/index). 
diff --git a/docs/source/ja/model_doc/auto.md b/docs/source/ja/model_doc/auto.md index d4baaf70e6..492c46c79e 100644 --- a/docs/source/ja/model_doc/auto.md +++ b/docs/source/ja/model_doc/auto.md @@ -368,3 +368,7 @@ AutoModel.register(NewModelConfig, NewModel) ### FlaxAutoModelForVision2Seq [[autodoc]] FlaxAutoModelForVision2Seq + +### AutoModelForImageTextToText + +[[autodoc]] AutoModelForImageTextToText diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index f8908f7d53..3d612f6c4c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1407,6 +1407,7 @@ else: "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_IMAGE_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", + "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING", "MODEL_FOR_IMAGE_TO_IMAGE_MAPPING", "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING", "MODEL_FOR_KEYPOINT_DETECTION_MAPPING", @@ -1448,6 +1449,7 @@ else: "AutoModelForDocumentQuestionAnswering", "AutoModelForImageClassification", "AutoModelForImageSegmentation", + "AutoModelForImageTextToText", "AutoModelForImageToImage", "AutoModelForInstanceSegmentation", "AutoModelForKeypointDetection", @@ -6272,6 +6274,7 @@ if TYPE_CHECKING: MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, + MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_KEYPOINT_DETECTION_MAPPING, @@ -6313,6 +6316,7 @@ if TYPE_CHECKING: AutoModelForDocumentQuestionAnswering, AutoModelForImageClassification, AutoModelForImageSegmentation, + AutoModelForImageTextToText, AutoModelForImageToImage, AutoModelForInstanceSegmentation, AutoModelForKeypointDetection, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 3bb2b8e9d4..2ee0541a1a 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -74,6 +74,7 @@ else: 
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING", "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", "MODEL_FOR_VISION_2_SEQ_MAPPING", + "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING", "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING", "MODEL_MAPPING", "MODEL_WITH_LM_HEAD_MAPPING", @@ -119,6 +120,7 @@ else: "AutoModelWithLMHead", "AutoModelForZeroShotImageClassification", "AutoModelForZeroShotObjectDetection", + "AutoModelForImageTextToText", ] try: @@ -238,6 +240,7 @@ if TYPE_CHECKING: MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, + MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_KEYPOINT_DETECTION_MAPPING, @@ -279,6 +282,7 @@ if TYPE_CHECKING: AutoModelForDocumentQuestionAnswering, AutoModelForImageClassification, AutoModelForImageSegmentation, + AutoModelForImageTextToText, AutoModelForImageToImage, AutoModelForInstanceSegmentation, AutoModelForKeypointDetection, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 8b990ba1f0..aa0d59de52 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -757,6 +757,32 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict( ] ) +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict( + [ + ("blip", "BlipForConditionalGeneration"), + ("blip-2", "Blip2ForConditionalGeneration"), + ("chameleon", "ChameleonForConditionalGeneration"), + ("fuyu", "FuyuForCausalLM"), + ("git", "GitForCausalLM"), + ("idefics", "IdeficsForVisionText2Text"), + ("idefics2", "Idefics2ForConditionalGeneration"), + ("idefics3", "Idefics3ForConditionalGeneration"), + ("instructblip", "InstructBlipForConditionalGeneration"), + ("kosmos-2", "Kosmos2ForConditionalGeneration"), + ("llava", "LlavaForConditionalGeneration"), + ("llava_next", "LlavaNextForConditionalGeneration"), + ("llava_onevision", 
"LlavaOnevisionForConditionalGeneration"), + ("mllama", "MllamaForConditionalGeneration"), + ("paligemma", "PaliGemmaForConditionalGeneration"), + ("pix2struct", "Pix2StructForConditionalGeneration"), + ("pixtral", "LlavaForConditionalGeneration"), + ("qwen2_vl", "Qwen2VLForConditionalGeneration"), + ("udop", "UdopForConditionalGeneration"), + ("vipllava", "VipLlavaForConditionalGeneration"), + ("vision-encoder-decoder", "VisionEncoderDecoderModel"), + ] +) + MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( [ # Model for Masked LM mapping @@ -1419,6 +1445,9 @@ MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES ) MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES) +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES +) MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES ) @@ -1713,6 +1742,13 @@ class AutoModelForVision2Seq(_BaseAutoModelClass): AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling") +class AutoModelForImageTextToText(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING + + +AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling") + + class AutoModelForAudioClassification(_BaseAutoModelClass): _model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index c894840c6a..c1f23bc1cb 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -99,6 +99,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( ("trocr", "TrOCRProcessor"), ("tvlt", "TvltProcessor"), ("tvp", "TvpProcessor"), + 
("udop", "UdopProcessor"), ("unispeech", "Wav2Vec2Processor"), ("unispeech-sat", "Wav2Vec2Processor"), ("video_llava", "VideoLlavaProcessor"), diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index ea0bbc1701..048de1cc8a 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -707,6 +707,9 @@ MODEL_FOR_IMAGE_MAPPING = None MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = None + + MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = None @@ -874,6 +877,13 @@ class AutoModelForImageSegmentation(metaclass=DummyObject): requires_backends(self, ["torch"]) +class AutoModelForImageTextToText(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class AutoModelForImageToImage(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 913111c0a0..8e211310fb 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -23,7 +23,7 @@ import unittest import numpy as np import requests -from transformers import AutoModelForVision2Seq, AutoProcessor, Kosmos2Config +from transformers import AutoModelForImageTextToText, AutoProcessor, Kosmos2Config from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available @@ -551,7 +551,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase): image.save("new_image.jpg") image = Image.open("new_image.jpg") - model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device) + model = 
AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device) processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224") prompt = "An image of" @@ -697,7 +697,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase): image.save("new_image.jpg") image = Image.open("new_image.jpg") - model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device) + model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device) prompt = ["Describe this image in detail:", "An image of"] diff --git a/utils/check_repo.py b/utils/check_repo.py index 75bd2ed1c6..3ecbd79eca 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -170,7 +170,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ "ClapTextModelWithProjection", "ClapAudioModel", "ClapAudioModelWithProjection", - "Blip2ForConditionalGeneration", "Blip2TextModelWithProjection", "Blip2VisionModelWithProjection", "Blip2QFormerModel", @@ -181,7 +180,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ "GitVisionModel", "GraphormerModel", "GraphormerForGraphClassification", - "BlipForConditionalGeneration", "BlipForImageTextRetrieval", "BlipForQuestionAnswering", "BlipVisionModel", @@ -245,7 +243,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ "DetrForSegmentation", "Pix2StructVisionModel", "Pix2StructTextModel", - "Pix2StructForConditionalGeneration", "ConditionalDetrForSegmentation", "DPRReader", "FlaubertForQuestionAnswering", @@ -322,7 +319,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ "SeamlessM4TCodeHifiGan", "SeamlessM4TForSpeechToSpeech", # no auto class for speech-to-speech "TvpForVideoGrounding", - "UdopForConditionalGeneration", "SeamlessM4Tv2NARTextToUnitModel", "SeamlessM4Tv2NARTextToUnitForConditionalGeneration", "SeamlessM4Tv2CodeHifiGan",