Add auto model for image-text-to-text (#32472)

* Add Auto model for image-text-to-text

* Remove donut from processing auto, add chameleon to image-text-to-text models

* add qwen2_vl and llava_onevision

* add pixtral to auto model for image-text-to-text

* add mllama and idefics3

* remove models in IGNORE_NON_AUTO_CONFIGURED

* add AutoModelForImageTextToText to tests and doc
This commit is contained in:
Yoni Gozlan
2024-10-08 14:26:43 +02:00
committed by GitHub
parent 0dbc7090ba
commit e2001c3413
11 changed files with 89 additions and 28 deletions

View File

@@ -23,7 +23,7 @@ import unittest
import numpy as np
import requests
from transformers import AutoModelForVision2Seq, AutoProcessor, Kosmos2Config
from transformers import AutoModelForImageTextToText, AutoProcessor, Kosmos2Config
from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig
from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available, is_vision_available
@@ -551,7 +551,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
image.save("new_image.jpg")
image = Image.open("new_image.jpg")
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
prompt = "<grounding>An image of"
@@ -697,7 +697,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
image.save("new_image.jpg")
image = Image.open("new_image.jpg")
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
prompt = ["<grounding>Describe this image in detail:", "<grounding>An image of"]