From 102370544044e020cc435f96dcada668e35d7aea Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Fri, 14 Jul 2023 14:43:19 -0400
Subject: [PATCH] Check models used for common tests are small (#24824)

* First models

* Conditional DETR

* Treat DETR models, skip others

* Skip LayoutLMv2 as well

* Fix last tests
---
 .../configuration_conditional_detr.py         | 15 ++++-
 .../modeling_conditional_detr.py              | 17 +++---
 .../configuration_deformable_detr.py          | 14 +++++
 src/transformers/models/detr/modeling_detr.py |  8 +--
 .../bridgetower/test_modeling_bridgetower.py  | 16 +++--
 tests/models/canine/test_modeling_canine.py   |  4 ++
 tests/models/clap/test_modeling_clap.py       | 13 +++--
 .../test_modeling_conditional_detr.py         | 50 ++++++----------
 tests/models/ctrl/test_modeling_ctrl.py       |  4 ++
 tests/models/cvt/test_modeling_cvt.py         |  4 ++
 .../test_modeling_deformable_detr.py          | 58 +++++++++----------
 tests/models/deta/test_modeling_deta.py       |  4 ++
 tests/models/detr/test_modeling_detr.py       | 50 +++++++---------
 tests/models/dpt/test_modeling_dpt.py         |  4 ++
 tests/models/dpt/test_modeling_dpt_hybrid.py  |  4 ++
 .../test_modeling_efficientnet.py             |  4 ++
 tests/models/encodec/test_modeling_encodec.py |  4 ++
 tests/models/esm/test_modeling_esm.py         |  4 ++
 tests/models/esm/test_modeling_esmfold.py     |  4 ++
 tests/models/flava/test_modeling_flava.py     | 20 +++++++
 tests/models/git/test_modeling_git.py         |  4 ++
 .../test_modeling_gptsan_japanese.py          |  8 +++
 .../graphormer/test_modeling_graphormer.py    |  4 ++
 .../models/layoutlm/test_modeling_layoutlm.py |  4 ++
 .../layoutlmv2/test_modeling_layoutlmv2.py    |  4 ++
 tests/models/levit/test_modeling_levit.py     |  4 ++
 .../mask2former/test_modeling_mask2former.py  |  4 ++
 .../maskformer/test_modeling_maskformer.py    |  4 ++
 .../mobilevit/test_modeling_mobilevit.py      |  4 ++
 .../mobilevitv2/test_modeling_mobilevitv2.py  |  4 ++
 .../oneformer/test_modeling_oneformer.py      |  4 ++
 .../perceiver/test_modeling_perceiver.py      |  4 ++
 .../segformer/test_modeling_segformer.py      |  4 ++
 .../models/speecht5/test_modeling_speecht5.py | 20 +++++++
 .../swiftformer/test_modeling_swiftformer.py  |  4 ++
 .../test_modeling_table_transformer.py        |  4 ++
 .../test_modeling_timm_backbone.py            |  4 ++
 tests/models/tvlt/test_modeling_tvlt.py       |  4 ++
 tests/models/upernet/test_modeling_upernet.py |  4 ++
 .../models/videomae/test_modeling_videomae.py |  4 ++
 tests/models/vit_mae/test_modeling_vit_mae.py |  4 ++
 tests/models/vivit/test_modeling_vivit.py     |  4 ++
 tests/test_modeling_common.py                 | 12 ++++
 43 files changed, 305 insertions(+), 116 deletions(-)

diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py
index ec04f0a523..9ce6179a19 100644
--- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py
+++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Conditional DETR model configuration"""
-
+import copy
 from collections import OrderedDict
 from typing import Mapping
 
@@ -238,6 +238,19 @@ class ConditionalDetrConfig(PretrainedConfig):
     def hidden_size(self) -> int:
         return self.d_model
 
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        if self.backbone_config is not None:
+            output["backbone_config"] = self.backbone_config.to_dict()
+        output["model_type"] = self.__class__.model_type
+        return output
+
 
 class ConditionalDetrOnnxConfig(OnnxConfig):
     torch_onnx_minimum_version = version.parse("1.11")
diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py
index fc36732cc4..f824b52430 100644
--- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py
+++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py
@@ -499,10 +499,11 @@ def build_position_encoding(config):
 
 
 # function to generate sine positional embedding for 2d coordinates
-def gen_sine_position_embeddings(pos_tensor):
+def gen_sine_position_embeddings(pos_tensor, d_model):
     scale = 2 * math.pi
-    dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)
-    dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / 128)
+    dim = d_model // 2
+    dim_t = torch.arange(dim, dtype=torch.float32, device=pos_tensor.device)
+    dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / dim)
     x_embed = pos_tensor[:, :, 0] * scale
     y_embed = pos_tensor[:, :, 1] * scale
     pos_x = x_embed[:, :, None] / dim_t
@@ -1376,7 +1377,7 @@ class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel):
         reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1)
         obj_center = reference_points[..., :2].transpose(0, 1)
         # get sine embedding for the query vector
-        query_sine_embed_before_transformation = gen_sine_position_embeddings(obj_center)
+        query_sine_embed_before_transformation = gen_sine_position_embeddings(obj_center, self.config.d_model)
 
         for idx, decoder_layer in enumerate(self.layers):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
@@ -2093,13 +2094,13 @@ class ConditionalDetrMaskHeadSmallConv(nn.Module):
         self.lay1 = nn.Conv2d(dim, dim, 3, padding=1)
         self.gn1 = nn.GroupNorm(8, dim)
         self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1)
-        self.gn2 = nn.GroupNorm(8, inter_dims[1])
+        self.gn2 = nn.GroupNorm(min(8, inter_dims[1]), inter_dims[1])
         self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1)
-        self.gn3 = nn.GroupNorm(8, inter_dims[2])
+        self.gn3 = nn.GroupNorm(min(8, inter_dims[2]), inter_dims[2])
         self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1)
-        self.gn4 = nn.GroupNorm(8, inter_dims[3])
+        self.gn4 = nn.GroupNorm(min(8, inter_dims[3]), inter_dims[3])
         self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1)
-        self.gn5 = nn.GroupNorm(8, inter_dims[4])
+        self.gn5 = nn.GroupNorm(min(8, inter_dims[4]), inter_dims[4])
         self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1)
 
         self.dim = dim
diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py
index dbe5fd7f0a..be4634477d 100644
--- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py
+++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Deformable DETR model configuration"""
+import copy
 
 from ...configuration_utils import PretrainedConfig
 from ...utils import logging
@@ -260,3 +261,16 @@ class DeformableDetrConfig(PretrainedConfig):
     @property
     def hidden_size(self) -> int:
         return self.d_model
+
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        if self.backbone_config is not None:
+            output["backbone_config"] = self.backbone_config.to_dict()
+        output["model_type"] = self.__class__.model_type
+        return output
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index 3476e1e2e7..ec47c9bb4f 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -1791,13 +1791,13 @@ class DetrMaskHeadSmallConv(nn.Module):
         self.lay1 = nn.Conv2d(dim, dim, 3, padding=1)
         self.gn1 = nn.GroupNorm(8, dim)
         self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1)
-        self.gn2 = nn.GroupNorm(8, inter_dims[1])
+        self.gn2 = nn.GroupNorm(min(8, inter_dims[1]), inter_dims[1])
         self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1)
-        self.gn3 = nn.GroupNorm(8, inter_dims[2])
+        self.gn3 = nn.GroupNorm(min(8, inter_dims[2]), inter_dims[2])
         self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1)
-        self.gn4 = nn.GroupNorm(8, inter_dims[3])
+        self.gn4 = nn.GroupNorm(min(8, inter_dims[3]), inter_dims[3])
         self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1)
-        self.gn5 = nn.GroupNorm(8, inter_dims[4])
+        self.gn5 = nn.GroupNorm(min(8, inter_dims[4]), inter_dims[4])
         self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1)
 
         self.dim = dim
diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py
index 0eb293fdc3..8c7bd00ee6 100644
--- a/tests/models/bridgetower/test_modeling_bridgetower.py
+++ b/tests/models/bridgetower/test_modeling_bridgetower.py
@@ -62,12 +62,12 @@ class BridgeTowerTextModelTester:
         self,
         parent,
         hidden_act="gelu",
-        hidden_size=128,
+        hidden_size=64,
         initializer_factor=1,
         layer_norm_eps=1e-05,
         num_attention_heads=4,
         num_hidden_layers=2,
-        intermediate_size=256,
+        intermediate_size=128,
         tie_word_embeddings=False,
         output_hidden_states=False,
     ):
@@ -105,6 +105,7 @@ class BridgeTowerTextModelTester:
             intermediate_size=self.intermediate_size,
             tie_word_embeddings=self.tie_word_embeddings,
             output_hidden_states=self.output_hidden_states,
+            vocab_size=self.vocab_size,
         )
 
 
@@ -112,7 +113,7 @@ class BridgeTowerImageModelTester:
     def __init__(
         self,
         parent,
-        hidden_size=128,
+        hidden_size=64,
         initializer_factor=1,
         layer_norm_eps=1e-05,
         num_hidden_layers=2,
@@ -168,10 +169,10 @@ class BridgeTowerModelTester:
         init_layernorm_from_vision_encoder=False,
         contrastive_hidden_size=512,
         logit_scale_init_value=2.6592,
-        hidden_size=128,
+        hidden_size=64,
         num_hidden_layers=2,
         num_attention_heads=4,
-        intermediate_size=256,
+        intermediate_size=128,
     ):
         if text_kwargs is None:
             text_kwargs = {}
@@ -280,7 +281,10 @@ class BridgeTowerModelTester:
         result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask)
         result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
 
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.text_model_tester.seq_length, 50265))
+        self.parent.assertEqual(
+            result.logits.shape,
+            (self.batch_size, self.text_model_tester.seq_length, self.text_model_tester.vocab_size),
+        )
 
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py
index d612a02bf4..057fc09131 100644
--- a/tests/models/canine/test_modeling_canine.py
+++ b/tests/models/canine/test_modeling_canine.py
@@ -50,6 +50,7 @@ class CanineModelTester:
         use_token_type_ids=True,
         use_labels=True,
         # let's use a vocab size that's way bigger than BERT's one
+        # NOTE: this is not a model parameter, just an input
         vocab_size=100000,
         hidden_size=32,
         num_hidden_layers=5,
@@ -64,6 +65,7 @@ class CanineModelTester:
         initializer_range=0.02,
         num_labels=3,
         num_choices=4,
+        num_hash_buckets=16,
         scope=None,
     ):
         self.parent = parent
@@ -87,6 +89,7 @@ class CanineModelTester:
         self.initializer_range = initializer_range
         self.num_labels = num_labels
         self.num_choices = num_choices
+        self.num_hash_buckets = num_hash_buckets
         self.scope = scope
 
     def prepare_config_and_inputs(self):
@@ -125,6 +128,7 @@ class CanineModelTester:
             type_vocab_size=self.type_vocab_size,
             is_decoder=False,
             initializer_range=self.initializer_range,
+            num_hash_buckets=self.num_hash_buckets,
         )
 
     def create_and_check_model(
diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py
index 53b4082c68..35c0f4e203 100644
--- a/tests/models/clap/test_modeling_clap.py
+++ b/tests/models/clap/test_modeling_clap.py
@@ -67,11 +67,12 @@ class ClapAudioModelTester:
         freq_ratio=2,
         num_channels=3,
         is_training=True,
-        hidden_size=256,
-        patch_embeds_hidden_size=32,
+        hidden_size=32,
+        patch_embeds_hidden_size=16,
         projection_dim=32,
-        num_hidden_layers=4,
-        num_heads=[2, 2, 2, 2],
+        depths=[2, 2],
+        num_hidden_layers=2,
+        num_heads=[2, 2],
         intermediate_size=37,
         dropout=0.1,
         attention_dropout=0.1,
@@ -89,6 +90,7 @@ class ClapAudioModelTester:
         self.hidden_size = hidden_size
         self.projection_dim = projection_dim
         self.num_hidden_layers = num_hidden_layers
+        self.depths = depths
         self.num_heads = num_heads
         self.num_attention_heads = num_heads[0]
         self.seq_length = seq_length
@@ -118,6 +120,7 @@ class ClapAudioModelTester:
             hidden_size=self.hidden_size,
             patch_stride=self.patch_stride,
             projection_dim=self.projection_dim,
+            depths=self.depths,
             num_hidden_layers=self.num_hidden_layers,
             num_attention_heads=self.num_heads,
             intermediate_size=self.intermediate_size,
@@ -203,7 +206,7 @@ class ClapAudioModelTest(ModelTesterMixin, unittest.TestCase):
 
             self.assertListEqual(
                 list(hidden_states[0].shape[-2:]),
-                [self.model_tester.patch_embeds_hidden_size, self.model_tester.patch_embeds_hidden_size],
+                [2 * self.model_tester.patch_embeds_hidden_size, 2 * self.model_tester.patch_embeds_hidden_size],
             )
 
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py
index e1827f0213..10d788bd69 100644
--- a/tests/models/conditional_detr/test_modeling_conditional_detr.py
+++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py
@@ -19,8 +19,8 @@ import inspect
 import math
 import unittest
 
-from transformers import ConditionalDetrConfig, is_timm_available, is_vision_available
-from transformers.testing_utils import require_timm, require_vision, slow, torch_device
+from transformers import ConditionalDetrConfig, ResNetConfig, is_torch_available, is_vision_available
+from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property
 
 from ...generation.test_utils import GenerationTesterMixin
@@ -29,14 +29,13 @@ from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
-if is_timm_available():
+if is_torch_available():
     import torch
 
     from transformers import (
         ConditionalDetrForObjectDetection,
         ConditionalDetrForSegmentation,
         ConditionalDetrModel,
-        ResNetConfig,
     )
 
 
@@ -53,7 +52,7 @@ class ConditionalDetrModelTester:
         batch_size=8,
         is_training=True,
         use_labels=True,
-        hidden_size=256,
+        hidden_size=32,
         num_hidden_layers=2,
         num_attention_heads=8,
         intermediate_size=4,
@@ -111,6 +110,16 @@ class ConditionalDetrModelTester:
         return config, pixel_values, pixel_mask, labels
 
     def get_config(self):
+        resnet_config = ResNetConfig(
+            num_channels=3,
+            embeddings_size=10,
+            hidden_sizes=[10, 20, 30, 40],
+            depths=[1, 1, 2, 1],
+            hidden_act="relu",
+            num_labels=3,
+            out_features=["stage2", "stage3", "stage4"],
+            out_indices=[2, 3, 4],
+        )
         return ConditionalDetrConfig(
             d_model=self.hidden_size,
             encoder_layers=self.num_hidden_layers,
@@ -123,6 +132,8 @@ class ConditionalDetrModelTester:
             attention_dropout=self.attention_probs_dropout_prob,
             num_queries=self.num_queries,
             num_labels=self.num_labels,
+            use_timm_backbone=False,
+            backbone_config=resnet_config,
         )
 
     def prepare_config_and_inputs_for_common(self):
@@ -159,27 +170,8 @@ class ConditionalDetrModelTester:
         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels))
         self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
 
-    def create_and_check_no_timm_backbone(self, config, pixel_values, pixel_mask, labels):
-        config.use_timm_backbone = False
-        config.backbone_config = ResNetConfig()
-        model = ConditionalDetrForObjectDetection(config=config)
-        model.to(torch_device)
-        model.eval()
 
-        result = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
-        result = model(pixel_values)
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels))
-        self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
-
-        result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
-
-        self.parent.assertEqual(result.loss.shape, ())
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels))
-        self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
-
-
-@require_timm
+@require_torch
 class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (
         (
@@ -187,12 +179,12 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
             ConditionalDetrForObjectDetection,
             ConditionalDetrForSegmentation,
         )
-        if is_timm_available()
+        if is_torch_available()
         else ()
     )
     pipeline_model_mapping = (
         {"feature-extraction": ConditionalDetrModel, "object-detection": ConditionalDetrForObjectDetection}
-        if is_timm_available()
+        if is_torch_available()
         else {}
     )
     is_encoder_decoder = True
@@ -243,10 +235,6 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_conditional_detr_object_detection_head_model(*config_and_inputs)
 
-    def test_conditional_detr_no_timm_backbone(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_no_timm_backbone(*config_and_inputs)
-
     # TODO: check if this works again for PyTorch 2.x.y
     @unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.")
     def test_multi_gpu_data_parallel_forward(self):
diff --git a/tests/models/ctrl/test_modeling_ctrl.py b/tests/models/ctrl/test_modeling_ctrl.py
index 8941927f17..fd20277239 100644
--- a/tests/models/ctrl/test_modeling_ctrl.py
+++ b/tests/models/ctrl/test_modeling_ctrl.py
@@ -243,6 +243,10 @@ class CTRLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in CTRL_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/cvt/test_modeling_cvt.py b/tests/models/cvt/test_modeling_cvt.py
index e7fc756ea8..ab37e47b51 100644
--- a/tests/models/cvt/test_modeling_cvt.py
+++ b/tests/models/cvt/test_modeling_cvt.py
@@ -247,6 +247,10 @@ class CvtModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in CVT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py
index 180f57f042..1864d054fd 100644
--- a/tests/models/deformable_detr/test_modeling_deformable_detr.py
+++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py
@@ -20,9 +20,16 @@ import math
 import unittest
 from typing import Dict, List, Tuple
 
-from transformers import DeformableDetrConfig, is_timm_available, is_vision_available
+from transformers import DeformableDetrConfig, ResNetConfig, is_torch_available, is_vision_available
 from transformers.file_utils import cached_property
-from transformers.testing_utils import require_timm, require_torch_gpu, require_vision, slow, torch_device
+from transformers.testing_utils import (
+    require_timm,
+    require_torch,
+    require_torch_gpu,
+    require_vision,
+    slow,
+    torch_device,
+)
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -30,10 +37,10 @@ from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
-if is_timm_available():
+if is_torch_available():
     import torch
 
-    from transformers import DeformableDetrForObjectDetection, DeformableDetrModel, ResNetConfig
+    from transformers import DeformableDetrForObjectDetection, DeformableDetrModel
 
 
 if is_vision_available():
@@ -49,7 +56,7 @@ class DeformableDetrModelTester:
         batch_size=8,
         is_training=True,
         use_labels=True,
-        hidden_size=256,
+        hidden_size=32,
         num_hidden_layers=2,
         num_attention_heads=8,
         intermediate_size=4,
@@ -116,6 +123,16 @@ class DeformableDetrModelTester:
         return config, pixel_values, pixel_mask, labels
 
     def get_config(self):
+        resnet_config = ResNetConfig(
+            num_channels=3,
+            embeddings_size=10,
+            hidden_sizes=[10, 20, 30, 40],
+            depths=[1, 1, 2, 1],
+            hidden_act="relu",
+            num_labels=3,
+            out_features=["stage2", "stage3", "stage4"],
+            out_indices=[2, 3, 4],
+        )
         return DeformableDetrConfig(
             d_model=self.hidden_size,
             encoder_layers=self.num_hidden_layers,
@@ -131,6 +148,8 @@ class DeformableDetrModelTester:
             num_feature_levels=self.num_feature_levels,
             encoder_n_points=self.encoder_n_points,
             decoder_n_points=self.decoder_n_points,
+            use_timm_backbone=False,
+            backbone_config=resnet_config,
         )
 
     def prepare_config_and_inputs_for_common(self):
@@ -165,32 +184,13 @@ class DeformableDetrModelTester:
         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels))
         self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
 
-    def create_and_check_no_timm_backbone(self, config, pixel_values, pixel_mask, labels):
-        config.use_timm_backbone = False
-        config.backbone_config = ResNetConfig()
-        model = DeformableDetrForObjectDetection(config=config)
-        model.to(torch_device)
-        model.eval()
 
-        result = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
-        result = model(pixel_values)
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels))
-        self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
-
-        result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
-
-        self.parent.assertEqual(result.loss.shape, ())
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels))
-        self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
-
-
-@require_timm
+@require_torch
 class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (DeformableDetrModel, DeformableDetrForObjectDetection) if is_timm_available() else ()
+    all_model_classes = (DeformableDetrModel, DeformableDetrForObjectDetection) if is_torch_available() else ()
     pipeline_model_mapping = (
         {"feature-extraction": DeformableDetrModel, "object-detection": DeformableDetrForObjectDetection}
-        if is_timm_available()
+        if is_torch_available()
         else {}
     )
     is_encoder_decoder = True
@@ -246,10 +246,6 @@ class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_deformable_detr_object_detection_head_model(*config_and_inputs)
 
-    def test_deformable_detr_no_timm_backbone(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_no_timm_backbone(*config_and_inputs)
-
     @unittest.skip(reason="Deformable DETR does not use inputs_embeds")
     def test_inputs_embeds(self):
         pass
diff --git a/tests/models/deta/test_modeling_deta.py b/tests/models/deta/test_modeling_deta.py
index b1d54a87de..0693b030e2 100644
--- a/tests/models/deta/test_modeling_deta.py
+++ b/tests/models/deta/test_modeling_deta.py
@@ -423,6 +423,10 @@ class DetaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
     def test_tied_model_weights_key_ignore(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     def test_initialization(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py
index f993a96dcb..abede6fa14 100644
--- a/tests/models/detr/test_modeling_detr.py
+++ b/tests/models/detr/test_modeling_detr.py
@@ -19,7 +19,7 @@ import inspect
 import math
 import unittest
 
-from transformers import DetrConfig, is_timm_available, is_vision_available
+from transformers import DetrConfig, ResNetConfig, is_torch_available, is_vision_available
 from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property
 
@@ -29,10 +29,10 @@ from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
-if is_timm_available():
+if is_torch_available():
     import torch
 
-    from transformers import DetrForObjectDetection, DetrForSegmentation, DetrModel, ResNetConfig
+    from transformers import DetrForObjectDetection, DetrForSegmentation, DetrModel
 
 
 if is_vision_available():
@@ -48,7 +48,7 @@ class DetrModelTester:
         batch_size=8,
         is_training=True,
         use_labels=True,
-        hidden_size=256,
+        hidden_size=32,
         num_hidden_layers=2,
         num_attention_heads=8,
         intermediate_size=4,
@@ -106,6 +106,16 @@ class DetrModelTester:
         return config, pixel_values, pixel_mask, labels
 
     def get_config(self):
+        resnet_config = ResNetConfig(
+            num_channels=3,
+            embeddings_size=10,
+            hidden_sizes=[10, 20, 30, 40],
+            depths=[1, 1, 2, 1],
+            hidden_act="relu",
+            num_labels=3,
+            out_features=["stage2", "stage3", "stage4"],
+            out_indices=[2, 3, 4],
+        )
         return DetrConfig(
             d_model=self.hidden_size,
             encoder_layers=self.num_hidden_layers,
@@ -118,6 +128,8 @@ class DetrModelTester:
             attention_dropout=self.attention_probs_dropout_prob,
             num_queries=self.num_queries,
             num_labels=self.num_labels,
+            use_timm_backbone=False,
+            backbone_config=resnet_config,
         )
 
     def prepare_config_and_inputs_for_common(self):
@@ -154,27 +166,8 @@ class DetrModelTester:
         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1))
         self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
 
-    def create_and_check_no_timm_backbone(self, config, pixel_values, pixel_mask, labels):
-        config.use_timm_backbone = False
-        config.backbone_config = ResNetConfig()
-        model = DetrForObjectDetection(config=config)
-        model.to(torch_device)
-        model.eval()
 
-        result = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
-        result = model(pixel_values)
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1))
-        self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
-
-        result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
-
-        self.parent.assertEqual(result.loss.shape, ())
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1))
-        self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
-
-
-@require_timm
+@require_torch
 class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (
         (
@@ -182,7 +175,7 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
             DetrForObjectDetection,
             DetrForSegmentation,
         )
-        if is_timm_available()
+        if is_torch_available()
         else ()
     )
     pipeline_model_mapping = (
@@ -191,7 +184,7 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
             "image-segmentation": DetrForSegmentation,
             "object-detection": DetrForObjectDetection,
         }
-        if is_timm_available()
+        if is_torch_available()
         else {}
     )
     is_encoder_decoder = True
@@ -242,10 +235,6 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_detr_object_detection_head_model(*config_and_inputs)
 
-    def test_detr_no_timm_backbone(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_no_timm_backbone(*config_and_inputs)
-
     # TODO: check if this works again for PyTorch 2.x.y
     @unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.")
     def test_multi_gpu_data_parallel_forward(self):
@@ -464,6 +453,7 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
 
         # let's set num_channels to 1
         config.num_channels = 1
+        config.backbone_config.num_channels = 1
 
         for model_class in self.all_model_classes:
             model = model_class(config)
diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py
index 31e6cf221d..f9cb66607d 100644
--- a/tests/models/dpt/test_modeling_dpt.py
+++ b/tests/models/dpt/test_modeling_dpt.py
@@ -275,6 +275,10 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
                         msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                     )
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in DPT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py
index 68d600f33f..4c32c76e86 100644
--- a/tests/models/dpt/test_modeling_dpt_hybrid.py
+++ b/tests/models/dpt/test_modeling_dpt_hybrid.py
@@ -289,6 +289,10 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
                         msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                     )
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in DPT_PRETRAINED_MODEL_ARCHIVE_LIST[1:]:
diff --git a/tests/models/efficientnet/test_modeling_efficientnet.py b/tests/models/efficientnet/test_modeling_efficientnet.py
index 11babddac2..e77c17a7a6 100644
--- a/tests/models/efficientnet/test_modeling_efficientnet.py
+++ b/tests/models/efficientnet/test_modeling_efficientnet.py
@@ -223,6 +223,10 @@ class EfficientNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Test
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in EFFICIENTNET_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py
index 389444d4b2..b888331461 100644
--- a/tests/models/encodec/test_modeling_encodec.py
+++ b/tests/models/encodec/test_modeling_encodec.py
@@ -397,6 +397,10 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
                             msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                         )
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     def test_identity_shortcut(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs()
         config.use_conv_shortcut = False
diff --git a/tests/models/esm/test_modeling_esm.py b/tests/models/esm/test_modeling_esm.py
index 2e5d48082b..fc1879e6bf 100644
--- a/tests/models/esm/test_modeling_esm.py
+++ b/tests/models/esm/test_modeling_esm.py
@@ -279,6 +279,10 @@ class EsmModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     def test_resize_tokens_embeddings(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 @require_torch
 class EsmModelIntegrationTest(TestCasePlus):
diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py
index 84444f3222..bc5c10ae24 100644
--- a/tests/models/esm/test_modeling_esmfold.py
+++ b/tests/models/esm/test_modeling_esmfold.py
@@ -243,6 +243,10 @@ class EsmFoldModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 @require_torch
 class EsmModelIntegrationTest(TestCasePlus):
diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py
index e69d2226c8..cef39224da 100644
--- a/tests/models/flava/test_modeling_flava.py
+++ b/tests/models/flava/test_modeling_flava.py
@@ -321,6 +321,10 @@ class FlavaImageModelTest(ModelTesterMixin, unittest.TestCase):
     def test_save_load_fast_init_to_base(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in FLAVA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
@@ -472,6 +476,10 @@ class FlavaTextModelTest(ModelTesterMixin, unittest.TestCase):
     def test_save_load_fast_init_to_base(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in FLAVA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
@@ -624,6 +632,10 @@ class FlavaMultimodalModelTest(ModelTesterMixin, unittest.TestCase):
     def test_save_load_fast_init_to_base(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in FLAVA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
@@ -731,6 +743,10 @@ class FlavaImageCodebookTest(ModelTesterMixin, unittest.TestCase):
     def test_save_load_fast_init_to_base(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in FLAVA_CODEBOOK_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
@@ -913,6 +929,10 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
                             msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                         )
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             return
diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py
index 1c39b61e47..5997230b16 100644
--- a/tests/models/git/test_modeling_git.py
+++ b/tests/models/git/test_modeling_git.py
@@ -454,6 +454,10 @@ class GitModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
     def test_greedy_generate_dict_outputs_use_cache(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 @require_torch
 @require_vision
diff --git a/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py b/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
index d00c755f3c..0738b294c0 100644
--- a/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
+++ b/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
@@ -182,6 +182,10 @@ class GPTSanJapaneseTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
     def test_model_parallelism(self):
         super().test_model_parallelism()
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 @require_torch
 class GPTSanJapaneseForConditionalGenerationTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
@@ -212,6 +216,10 @@ class GPTSanJapaneseForConditionalGenerationTest(ModelTesterMixin, GenerationTes
     def test_model_parallelism(self):
         super().test_model_parallelism()
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_logits(self):
         model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
diff --git a/tests/models/graphormer/test_modeling_graphormer.py b/tests/models/graphormer/test_modeling_graphormer.py
index bc860e1a8a..60f188f3b2 100644
--- a/tests/models/graphormer/test_modeling_graphormer.py
+++ b/tests/models/graphormer/test_modeling_graphormer.py
@@ -470,6 +470,10 @@ class GraphormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_graph_classification(*config_and_inputs)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in GRAPHORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py
index 0535fbf4e1..687d1ae4a5 100644
--- a/tests/models/layoutlm/test_modeling_layoutlm.py
+++ b/tests/models/layoutlm/test_modeling_layoutlm.py
@@ -279,6 +279,10 @@ class LayoutLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 def prepare_layoutlm_batch_inputs():
     # Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on:
diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
index 7d2f35c8b9..4eda8952c3 100644
--- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -415,6 +415,10 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/levit/test_modeling_levit.py b/tests/models/levit/test_modeling_levit.py
index 4f78527514..b78554374e 100644
--- a/tests/models/levit/test_modeling_levit.py
+++ b/tests/models/levit/test_modeling_levit.py
@@ -282,6 +282,10 @@ class LevitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
 
diff --git a/tests/models/mask2former/test_modeling_mask2former.py b/tests/models/mask2former/test_modeling_mask2former.py
index 7ec44c0713..c492bbb766 100644
--- a/tests/models/mask2former/test_modeling_mask2former.py
+++ b/tests/models/mask2former/test_modeling_mask2former.py
@@ -220,6 +220,10 @@ class Mask2FormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     def test_forward_signature(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
 
diff --git a/tests/models/maskformer/test_modeling_maskformer.py b/tests/models/maskformer/test_modeling_maskformer.py
index 81c720a0b5..69cf21d566 100644
--- a/tests/models/maskformer/test_modeling_maskformer.py
+++ b/tests/models/maskformer/test_modeling_maskformer.py
@@ -224,6 +224,10 @@ class MaskFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     def test_forward_signature(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
 
diff --git a/tests/models/mobilevit/test_modeling_mobilevit.py b/tests/models/mobilevit/test_modeling_mobilevit.py
index c0ad5a7744..350934ad05 100644
--- a/tests/models/mobilevit/test_modeling_mobilevit.py
+++ b/tests/models/mobilevit/test_modeling_mobilevit.py
@@ -231,6 +231,10 @@ class MobileViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
             expected_arg_names = ["pixel_values"]
             self.assertListEqual(arg_names[:1], expected_arg_names)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/mobilevitv2/test_modeling_mobilevitv2.py b/tests/models/mobilevitv2/test_modeling_mobilevitv2.py
index 0e55e6babe..7f5c332a61 100644
--- a/tests/models/mobilevitv2/test_modeling_mobilevitv2.py
+++ b/tests/models/mobilevitv2/test_modeling_mobilevitv2.py
@@ -225,6 +225,10 @@ class MobileViTV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     def test_forward_signature(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
 
diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py
index dc25a2b48f..f23c0f265c 100644
--- a/tests/models/oneformer/test_modeling_oneformer.py
+++ b/tests/models/oneformer/test_modeling_oneformer.py
@@ -309,6 +309,10 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
             expected_arg_names = ["pixel_values", "task_inputs"]
             self.assertListEqual(arg_names[:2], expected_arg_names)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in ["shi-labs/oneformer_ade20k_swin_tiny"]:
diff --git a/tests/models/perceiver/test_modeling_perceiver.py b/tests/models/perceiver/test_modeling_perceiver.py
index 64ad560d7d..23bd75bdd1 100644
--- a/tests/models/perceiver/test_modeling_perceiver.py
+++ b/tests/models/perceiver/test_modeling_perceiver.py
@@ -784,6 +784,10 @@ class PerceiverModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
 
                     loss.backward()
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @require_torch_multi_gpu
     @unittest.skip(
         reason=(
diff --git a/tests/models/segformer/test_modeling_segformer.py b/tests/models/segformer/test_modeling_segformer.py
index 03706ca96e..7f19001148 100644
--- a/tests/models/segformer/test_modeling_segformer.py
+++ b/tests/models/segformer/test_modeling_segformer.py
@@ -347,6 +347,10 @@ class SegformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
             loss = model(**inputs).loss
             loss.backward()
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py
index 7532f132a7..fbd09ef500 100644
--- a/tests/models/speecht5/test_modeling_speecht5.py
+++ b/tests/models/speecht5/test_modeling_speecht5.py
@@ -238,6 +238,10 @@ class SpeechT5ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         # disabled because this model doesn't have decoder_input_ids
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 @require_torch
 class SpeechT5ForSpeechToTextTester:
@@ -701,6 +705,10 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase):
     def test_training_gradient_checkpointing(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
@@ -988,6 +996,10 @@ class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase):
         if hasattr(module, "bias") and module.bias is not None:
             module.bias.data.fill_(3)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 @require_torch
 @require_sentencepiece
@@ -1404,6 +1416,10 @@ class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase):
         if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
             module.masked_spec_embed.data.fill_(3)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 @require_torch
 @require_sentencepiece
@@ -1546,6 +1562,10 @@ class SpeechT5HifiGanTest(ModelTesterMixin, unittest.TestCase):
     def test_retain_grad_hidden_states_attentions(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     # skip because it fails on automapping of SpeechT5HifiGanConfig
     def test_save_load_fast_init_from_base(self):
         pass
diff --git a/tests/models/swiftformer/test_modeling_swiftformer.py b/tests/models/swiftformer/test_modeling_swiftformer.py
index e524719d23..151807c80c 100644
--- a/tests/models/swiftformer/test_modeling_swiftformer.py
+++ b/tests/models/swiftformer/test_modeling_swiftformer.py
@@ -272,6 +272,10 @@ class SwiftFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
                         msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                     )
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 # We will verify our results on an image of cute cats
 def prepare_img():
diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py
index 2d42e29906..0df8da45cb 100644
--- a/tests/models/table_transformer/test_modeling_table_transformer.py
+++ b/tests/models/table_transformer/test_modeling_table_transformer.py
@@ -486,6 +486,10 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin
                             msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                         )
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 TOLERANCE = 1e-4
 
diff --git a/tests/models/timm_backbone/test_modeling_timm_backbone.py b/tests/models/timm_backbone/test_modeling_timm_backbone.py
index 2af7c4c617..c134a588b6 100644
--- a/tests/models/timm_backbone/test_modeling_timm_backbone.py
+++ b/tests/models/timm_backbone/test_modeling_timm_backbone.py
@@ -196,6 +196,10 @@ class TimmBackboneModelTest(ModelTesterMixin, BackboneTesterMixin, PipelineTeste
     def test_can_use_safetensors(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     def test_forward_signature(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
 
diff --git a/tests/models/tvlt/test_modeling_tvlt.py b/tests/models/tvlt/test_modeling_tvlt.py
index 52af9b6f1b..41eefc9eb7 100644
--- a/tests/models/tvlt/test_modeling_tvlt.py
+++ b/tests/models/tvlt/test_modeling_tvlt.py
@@ -542,6 +542,10 @@ class TvltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 # We will verify our results on a video of eating spaghetti
 # Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
diff --git a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py
index 79419f7b37..97ba37f8be 100644
--- a/tests/models/upernet/test_modeling_upernet.py
+++ b/tests/models/upernet/test_modeling_upernet.py
@@ -207,6 +207,10 @@ class UperNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py
index 424fc7d84a..6f2e7fa31e 100644
--- a/tests/models/videomae/test_modeling_videomae.py
+++ b/tests/models/videomae/test_modeling_videomae.py
@@ -344,6 +344,10 @@ class VideoMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 # We will verify our results on a video of eating spaghetti
 # Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py
index 8b63cd6d75..bb50cb9606 100644
--- a/tests/models/vit_mae/test_modeling_vit_mae.py
+++ b/tests/models/vit_mae/test_modeling_vit_mae.py
@@ -279,6 +279,10 @@ class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     def test_model_outputs_equivalence(self):
         pass
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/vivit/test_modeling_vivit.py b/tests/models/vivit/test_modeling_vivit.py
index 4440f46d04..d7d72eca7e 100644
--- a/tests/models/vivit/test_modeling_vivit.py
+++ b/tests/models/vivit/test_modeling_vivit.py
@@ -310,6 +310,10 @@ class VivitModelTest(ModelTesterMixin, unittest.TestCase):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
+    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    def test_model_is_small(self):
+        pass
+
 
 # We will verify our results on a video of eating spaghetti
 # Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 7e7ca36b4c..0d5080ec5a 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -2705,6 +2705,18 @@ class ModelTesterMixin:
                     else:
                         new_model_without_prefix(input_ids)
 
+    def test_model_is_small(self):
+        # Just a consistency check to make sure we are not running tests on 80M parameter models.
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+        # print(config)
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            num_params = model.num_parameters()
+            assert (
+                num_params < 1000000
+            ), f"{model_class} is too big for the common tests ({num_params})! It should have 200k max."
+
 
 global_rng = random.Random()