Add auto model for image-text-to-text (#32472)

* Add Auto model for image-text-to-text * Remove donut from processing auto, add chameleon ti image text to text models * add qwen2_vl and llava_onevision * add pixtral to auto model for image-text-to-text * add mllama and idefics3 * remove models in IGNORE_NON_AUTO_CONFIGURED * add AutoModelForImageTextToText to tests and doc
2024-10-08 14:26:43 +02:00
parent 0dbc7090ba
commit e2001c3413
11 changed files with 89 additions and 28 deletions
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -170,7 +170,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    "ClapTextModelWithProjection",
    "ClapAudioModel",
    "ClapAudioModelWithProjection",
-    "Blip2ForConditionalGeneration",
    "Blip2TextModelWithProjection",
    "Blip2VisionModelWithProjection",
    "Blip2QFormerModel",
@@ -181,7 +180,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    "GitVisionModel",
    "GraphormerModel",
    "GraphormerForGraphClassification",
-    "BlipForConditionalGeneration",
    "BlipForImageTextRetrieval",
    "BlipForQuestionAnswering",
    "BlipVisionModel",
@@ -245,7 +243,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    "DetrForSegmentation",
    "Pix2StructVisionModel",
    "Pix2StructTextModel",
-    "Pix2StructForConditionalGeneration",
    "ConditionalDetrForSegmentation",
    "DPRReader",
    "FlaubertForQuestionAnswering",
@@ -322,7 +319,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    "SeamlessM4TCodeHifiGan",
    "SeamlessM4TForSpeechToSpeech",  # no auto class for speech-to-speech
    "TvpForVideoGrounding",
-    "UdopForConditionalGeneration",
    "SeamlessM4Tv2NARTextToUnitModel",
    "SeamlessM4Tv2NARTextToUnitForConditionalGeneration",
    "SeamlessM4Tv2CodeHifiGan",