From b8db265bc6d0c9208ee465a12c6497149b4ee725 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Thu, 23 Nov 2023 21:00:39 +0100
Subject: [PATCH] Update tiny model summary file (#27388)

* update

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 .../models/auto/image_processing_auto.py      |   1 +
 tests/models/clvp/test_modeling_clvp.py       |   4 +-
 tests/models/fuyu/test_modeling_fuyu.py       |   5 +-
 tests/models/kosmos2/test_modeling_kosmos2.py |  14 +-
 .../test_modeling_seamless_m4t.py             |  18 +-
 tests/models/swin2sr/test_modeling_swin2sr.py |   6 +-
 tests/models/whisper/test_modeling_whisper.py |   1 +
 .../test_pipelines_text_generation.py         |   7 +-
 tests/utils/tiny_model_summary.json           | 231 +++++++++++++++++-
 9 files changed, 277 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 168b7a5dff..7d26d668ab 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -71,6 +71,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
         ("idefics", "IdeficsImageProcessor"),
         ("imagegpt", "ImageGPTImageProcessor"),
         ("instructblip", "BlipImageProcessor"),
+        ("kosmos-2", "CLIPImageProcessor"),
         ("layoutlmv2", "LayoutLMv2ImageProcessor"),
         ("layoutlmv3", "LayoutLMv3ImageProcessor"),
         ("levit", "LevitImageProcessor"),
diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py
index 1b3ab79034..3ebe5fe357 100644
--- a/tests/models/clvp/test_modeling_clvp.py
+++ b/tests/models/clvp/test_modeling_clvp.py
@@ -38,6 +38,7 @@ from ...test_modeling_common import (
     ids_tensor,
     random_attention_mask,
 )
+from ...test_pipeline_mixin import PipelineTesterMixin
 
 
 if is_torch_available():
@@ -281,9 +282,10 @@ class ClvpDecoderTester:
 
 
 @require_torch
-class ClvpDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+class ClvpDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (ClvpModel, ClvpForCausalLM) if is_torch_available() else ()
     all_generative_model_classes = (ClvpForCausalLM,) if is_torch_available() else ()
+    pipeline_model_mapping = {"feature-extraction": ClvpModelForConditionalGeneration} if is_torch_available() else {}
 
     test_pruning = False
 
diff --git a/tests/models/fuyu/test_modeling_fuyu.py b/tests/models/fuyu/test_modeling_fuyu.py
index d475e1e0ca..84c9128892 100644
--- a/tests/models/fuyu/test_modeling_fuyu.py
+++ b/tests/models/fuyu/test_modeling_fuyu.py
@@ -24,6 +24,7 @@ from transformers.testing_utils import require_torch, require_torch_gpu, slow, t
 from transformers.utils import cached_property
 
 from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_pipeline_mixin import PipelineTesterMixin
 
 
 if is_vision_available():
@@ -262,9 +263,9 @@ class FuyuModelTester:
 
 
 @require_torch
-class FuyuModelTest(ModelTesterMixin, unittest.TestCase):
+class FuyuModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (FuyuForCausalLM,) if is_torch_available() else ()
-    pipeline_model_mapping = {"image-to-text": FuyuForCausalLM} if is_torch_available() else {}
+    pipeline_model_mapping = {"text-generation": FuyuForCausalLM} if is_torch_available() else {}
 
     test_head_masking = False
     test_pruning = False
diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py
index 5491ded1bc..dd953eedc8 100644
--- a/tests/models/kosmos2/test_modeling_kosmos2.py
+++ b/tests/models/kosmos2/test_modeling_kosmos2.py
@@ -37,6 +37,7 @@ from ...test_modeling_common import (
     ids_tensor,
     random_attention_mask,
 )
+from ...test_pipeline_mixin import PipelineTesterMixin
 
 
 if is_torch_available():
@@ -244,15 +245,26 @@ class Kosmos2ModelTester:
 
 
 @require_torch
-class Kosmos2ModelTest(ModelTesterMixin, unittest.TestCase):
+class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (Kosmos2Model, Kosmos2ForConditionalGeneration) if is_torch_available() else ()
     all_generative_model_classes = (Kosmos2ForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = (
+        {"feature-extraction": Kosmos2Model, "image-to-text": Kosmos2ForConditionalGeneration}
+        if is_torch_available()
+        else {}
+    )
     fx_compatible = False
     test_head_masking = False
     test_pruning = False
     test_resize_embeddings = False
     test_attention_outputs = False
 
+    # TODO: skip `ImageToTextPipelineTests` for now: the `image-to-text` pipeline for this model needs a Processor.
+    def is_pipeline_test_to_skip(
+        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+    ):
+        return pipeline_test_casse_name == "ImageToTextPipelineTests"
+
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         inputs_dict = copy.deepcopy(inputs_dict)
 
diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
index ab7a48694d..4eb4c7359f 100644
--- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
@@ -34,6 +34,7 @@ from ...test_modeling_common import (
     ids_tensor,
     random_attention_mask,
 )
+from ...test_pipeline_mixin import PipelineTesterMixin
 
 
 if is_torch_available():
@@ -616,7 +617,9 @@ class SeamlessM4TModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase):
 
 
 @require_torch
-class SeamlessM4TModelWithTextInputTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+class SeamlessM4TModelWithTextInputTest(
+    ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
     is_encoder_decoder = True
     fx_compatible = False
     test_missing_keys = False
@@ -636,6 +639,19 @@ class SeamlessM4TModelWithTextInputTest(ModelTesterMixin, GenerationTesterMixin,
         else ()
     )
     all_generative_model_classes = (SeamlessM4TForTextToText,) if is_torch_available() else ()
+    pipeline_model_mapping = (
+        {
+            "automatic-speech-recognition": SeamlessM4TForSpeechToText,
+            "conversational": SeamlessM4TForTextToText,
+            "feature-extraction": SeamlessM4TModel,
+            "summarization": SeamlessM4TForTextToText,
+            "text-to-audio": SeamlessM4TForTextToSpeech,
+            "text2text-generation": SeamlessM4TForTextToText,
+            "translation": SeamlessM4TForTextToText,
+        }
+        if is_torch_available()
+        else {}
+    )
 
     def setUp(self):
         self.model_tester = SeamlessM4TModelTester(self, input_modality="text")
diff --git a/tests/models/swin2sr/test_modeling_swin2sr.py b/tests/models/swin2sr/test_modeling_swin2sr.py
index f94e11ad64..7306896036 100644
--- a/tests/models/swin2sr/test_modeling_swin2sr.py
+++ b/tests/models/swin2sr/test_modeling_swin2sr.py
@@ -162,7 +162,11 @@ class Swin2SRModelTester:
 @require_torch
 class Swin2SRModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (Swin2SRModel, Swin2SRForImageSuperResolution) if is_torch_available() else ()
-    pipeline_model_mapping = {"feature-extraction": Swin2SRModel} if is_torch_available() else {}
+    pipeline_model_mapping = (
+        {"feature-extraction": Swin2SRModel, "image-to-image": Swin2SRForImageSuperResolution}
+        if is_torch_available()
+        else {}
+    )
 
     fx_compatible = False
     test_pruning = False
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index f77d81d76e..6f01cfdac2 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -367,6 +367,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
             "audio-classification": WhisperForAudioClassification,
             "automatic-speech-recognition": WhisperForConditionalGeneration,
             "feature-extraction": WhisperModel,
+            "text-generation": WhisperForCausalLM,
         }
         if is_torch_available()
         else {}
diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py
index b9a5febb56..dc77204f3e 100644
--- a/tests/pipelines/test_pipelines_text_generation.py
+++ b/tests/pipelines/test_pipelines_text_generation.py
@@ -242,7 +242,12 @@ class TextGenerationPipelineTests(unittest.TestCase):
         # We don't care about infinite range models.
         # They already work.
         # Skip this test for XGLM, since it uses sinusoidal positional embeddings which are resized on-the-fly.
-        EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS = ["RwkvForCausalLM", "XGLMForCausalLM", "GPTNeoXForCausalLM"]
+        EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS = [
+            "RwkvForCausalLM",
+            "XGLMForCausalLM",
+            "GPTNeoXForCausalLM",
+            "FuyuForCausalLM",
+        ]
         if (
             tokenizer.model_max_length < 10000
             and text_generator.model.__class__.__name__ not in EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS
diff --git a/tests/utils/tiny_model_summary.json b/tests/utils/tiny_model_summary.json
index 2a1efa7d88..5f2c6c0b4e 100644
--- a/tests/utils/tiny_model_summary.json
+++ b/tests/utils/tiny_model_summary.json
@@ -877,6 +877,16 @@
         ],
         "sha": "a7874595b900f9b2ddc79130dafc3ff48f4fbfb9"
     },
+    "ClvpModelForConditionalGeneration": {
+        "tokenizer_classes": [
+            "ClvpTokenizer"
+        ],
+        "processor_classes": [
+            "ClvpFeatureExtractor"
+        ],
+        "model_classes": [],
+        "sha": "45df7581535be337ff781707b6c20994ca221f05"
+    },
     "CodeGenForCausalLM": {
         "tokenizer_classes": [
             "CodeGenTokenizer",
@@ -1039,7 +1049,8 @@
             "ConvNextImageProcessor"
         ],
         "model_classes": [
-            "ConvNextV2ForImageClassification"
+            "ConvNextV2ForImageClassification",
+            "TFConvNextV2ForImageClassification"
         ],
         "sha": "ee22bae1cbb87d66fc7f62f7e15a43d6ff80d3cc"
     },
@@ -1049,7 +1060,8 @@
             "ConvNextImageProcessor"
         ],
         "model_classes": [
-            "ConvNextV2Model"
+            "ConvNextV2Model",
+            "TFConvNextV2Model"
         ],
         "sha": "c4dd68ee1102cba05bcc483da2a88e39427b7249"
     },
@@ -2136,6 +2148,56 @@
         ],
         "sha": "683f6f73a2ab87801f1695a72d1af63cf173ab7c"
     },
+    "FalconForCausalLM": {
+        "tokenizer_classes": [
+            "PreTrainedTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "FalconForCausalLM"
+        ],
+        "sha": "60076d5dafc5e33ba9c90dcd05e7c0834e44049a"
+    },
+    "FalconForQuestionAnswering": {
+        "tokenizer_classes": [
+            "PreTrainedTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "FalconForQuestionAnswering"
+        ],
+        "sha": "b1ee9cd5fad2d177ea5a46df4611cd02f66ae788"
+    },
+    "FalconForSequenceClassification": {
+        "tokenizer_classes": [
+            "PreTrainedTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "FalconForSequenceClassification"
+        ],
+        "sha": "007838c0991c2b6a87dc49a8a5c20f29149a00fa"
+    },
+    "FalconForTokenClassification": {
+        "tokenizer_classes": [
+            "PreTrainedTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "FalconForTokenClassification"
+        ],
+        "sha": "0ea6ae548773daa6e3317fddc058957e956eebf4"
+    },
+    "FalconModel": {
+        "tokenizer_classes": [
+            "PreTrainedTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "FalconModel"
+        ],
+        "sha": "ca15a579c946eb00c5b39cc8e0ea63d0c1460f84"
+    },
     "FlaubertForMultipleChoice": {
         "tokenizer_classes": [
             "FlaubertTokenizer"
@@ -2364,6 +2426,18 @@
         ],
         "sha": "bfbaa8fa21c3abf80b94e7168b5ecff8ec5b5f76"
     },
+    "FuyuForCausalLM": {
+        "tokenizer_classes": [
+            "LlamaTokenizerFast"
+        ],
+        "processor_classes": [
+            "FuyuImageProcessor"
+        ],
+        "model_classes": [
+            "FuyuForCausalLM"
+        ],
+        "sha": "685d78258ea95c5c82e0e4555d0d4a2270ab8bff"
+    },
     "GLPNForDepthEstimation": {
         "tokenizer_classes": [],
         "processor_classes": [
@@ -2866,6 +2940,30 @@
         ],
         "sha": "5a7983e48d5841704733dd0756177680ed50c074"
     },
+    "Kosmos2ForConditionalGeneration": {
+        "tokenizer_classes": [
+            "XLMRobertaTokenizerFast"
+        ],
+        "processor_classes": [
+            "CLIPImageProcessor"
+        ],
+        "model_classes": [
+            "Kosmos2ForConditionalGeneration"
+        ],
+        "sha": "d1d4607782b911411676f1ee79997dee645def58"
+    },
+    "Kosmos2Model": {
+        "tokenizer_classes": [
+            "XLMRobertaTokenizerFast"
+        ],
+        "processor_classes": [
+            "CLIPImageProcessor"
+        ],
+        "model_classes": [
+            "Kosmos2Model"
+        ],
+        "sha": "379d8944a65312094d9ab1c4b8a82058a2d3274e"
+    },
     "LEDForConditionalGeneration": {
         "tokenizer_classes": [
             "LEDTokenizer",
@@ -3820,6 +3918,39 @@
         ],
         "sha": "f197d5bfa1fe27b5f28a6e6d4e3ad229b753450a"
     },
+    "MistralForCausalLM": {
+        "tokenizer_classes": [
+            "LlamaTokenizer",
+            "LlamaTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "MistralForCausalLM"
+        ],
+        "sha": "f7e06aeedbba8f4f665b438b868ed932d451f64b"
+    },
+    "MistralForSequenceClassification": {
+        "tokenizer_classes": [
+            "LlamaTokenizer",
+            "LlamaTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "MistralForSequenceClassification"
+        ],
+        "sha": "65045444ea1933309270d8b08b21d3fa94a84290"
+    },
+    "MistralModel": {
+        "tokenizer_classes": [
+            "LlamaTokenizer",
+            "LlamaTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "MistralModel"
+        ],
+        "sha": "becd727ad72b1e8a7c0fa0ea39b61904fa68aeac"
+    },
     "MobileBertForMaskedLM": {
         "tokenizer_classes": [
             "MobileBertTokenizer",
@@ -4558,6 +4689,32 @@
         ],
         "sha": "f0e27b2b4e53ba70e05d13dcfea8e85272b292a5"
     },
+    "Owlv2ForObjectDetection": {
+        "tokenizer_classes": [
+            "CLIPTokenizer",
+            "CLIPTokenizerFast"
+        ],
+        "processor_classes": [
+            "Owlv2ImageProcessor"
+        ],
+        "model_classes": [
+            "Owlv2ForObjectDetection"
+        ],
+        "sha": "30439c0b2749726468dc13a755261e8101170052"
+    },
+    "Owlv2Model": {
+        "tokenizer_classes": [
+            "CLIPTokenizer",
+            "CLIPTokenizerFast"
+        ],
+        "processor_classes": [
+            "Owlv2ImageProcessor"
+        ],
+        "model_classes": [
+            "Owlv2Model"
+        ],
+        "sha": "7aeebdad5f72b36cb07c74355afad8e6052e2377"
+    },
     "PLBartForCausalLM": {
         "tokenizer_classes": [
             "PLBartTokenizer"
@@ -4760,6 +4917,50 @@
         ],
         "sha": "b8c8d479e29e9ee048e2d0b05b001ac835ad8859"
     },
+    "PhiForCausalLM": {
+        "tokenizer_classes": [
+            "CodeGenTokenizer",
+            "CodeGenTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "PhiForCausalLM"
+        ],
+        "sha": "3fecc0109a4a3a230e3a5509eaf47a26eba85d79"
+    },
+    "PhiForSequenceClassification": {
+        "tokenizer_classes": [
+            "CodeGenTokenizer",
+            "CodeGenTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "PhiForSequenceClassification"
+        ],
+        "sha": "e1c9f8ebf1317516acc1cd6338de71a53e770245"
+    },
+    "PhiForTokenClassification": {
+        "tokenizer_classes": [
+            "CodeGenTokenizer",
+            "CodeGenTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "PhiForTokenClassification"
+        ],
+        "sha": "d3a8054903753b5c96c05eaf9877905a116a1d5e"
+    },
+    "PhiModel": {
+        "tokenizer_classes": [
+            "CodeGenTokenizer",
+            "CodeGenTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "PhiModel"
+        ],
+        "sha": "99c38d5ce7ace35127d00ed3eeb3561308ea6b21"
+    },
     "Pix2StructForConditionalGeneration": {
         "tokenizer_classes": [
             "T5TokenizerFast"
@@ -4768,7 +4969,9 @@
             "Pix2StructImageProcessor",
             "Pix2StructProcessor"
         ],
-        "model_classes": [],
+        "model_classes": [
+            "Pix2StructForConditionalGeneration"
+        ],
         "sha": "42b3de00ad535076c4893e4ac5ae2d2748cc4ccb"
     },
     "PoolFormerForImageClassification": {
@@ -5691,6 +5894,16 @@
         ],
         "sha": "25ba2d88c770533f8c69811d2a454a00c1d09f5d"
     },
+    "Swin2SRForImageSuperResolution": {
+        "tokenizer_classes": [],
+        "processor_classes": [
+            "Swin2SRImageProcessor"
+        ],
+        "model_classes": [
+            "Swin2SRForImageSuperResolution"
+        ],
+        "sha": "3a2780de0b455084c018ac8a62b56040969e26ec"
+    },
     "Swin2SRModel": {
         "tokenizer_classes": [],
         "processor_classes": [
@@ -6625,6 +6838,18 @@
         ],
         "sha": "d71b13674b1a67443cd19d0594a3b5b1e5968f0d"
     },
+    "WhisperForCausalLM": {
+        "tokenizer_classes": [
+            "WhisperTokenizer"
+        ],
+        "processor_classes": [
+            "WhisperFeatureExtractor"
+        ],
+        "model_classes": [
+            "WhisperForCausalLM"
+        ],
+        "sha": "e7febfd7f4512e029293c677e6d2633e23fc459a"
+    },
     "WhisperForConditionalGeneration": {
         "tokenizer_classes": [
             "WhisperTokenizer",