Add auto model for image-text-to-text (#32472)

* Add Auto model for image-text-to-text * Remove donut from processing auto, add chameleon ti image text to text models * add qwen2_vl and llava_onevision * add pixtral to auto model for image-text-to-text * add mllama and idefics3 * remove models in IGNORE_NON_AUTO_CONFIGURED * add AutoModelForImageTextToText to tests and doc
2024-10-08 14:26:43 +02:00
parent 0dbc7090ba
commit e2001c3413
11 changed files with 89 additions and 28 deletions
--- a/tests/models/kosmos2/test_modeling_kosmos2.py
+++ b/tests/models/kosmos2/test_modeling_kosmos2.py
@@ -23,7 +23,7 @@ import unittest
 import numpy as np
 import requests

-from transformers import AutoModelForVision2Seq, AutoProcessor, Kosmos2Config
+from transformers import AutoModelForImageTextToText, AutoProcessor, Kosmos2Config
 from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig
 from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device
 from transformers.utils import is_torch_available, is_vision_available
@@ -551,7 +551,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
        image.save("new_image.jpg")
        image = Image.open("new_image.jpg")

-        model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
+        model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
        processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        prompt = "<grounding>An image of"
@@ -697,7 +697,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase):
        image.save("new_image.jpg")
        image = Image.open("new_image.jpg")

-        model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
+        model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)

        prompt = ["<grounding>Describe this image in detail:", "<grounding>An image of"]