From f295fc8a164b5882df94266a7f371c3158c930f5 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Tue, 25 Jul 2023 07:56:04 -0400
Subject: [PATCH] Fix last models for common tests that are too big. (#25058)

* Fix last models for common tests that are too big.

* Remove print statement
---
 .../perceiver/configuration_perceiver.py      |  6 +++
 .../models/perceiver/modeling_perceiver.py    | 14 +++----
 .../configuration_table_transformer.py        | 15 ++++++-
 .../models/layoutlm/test_modeling_layoutlm.py |  4 --
 .../layoutlmv2/test_modeling_layoutlmv2.py    |  2 +-
 .../oneformer/test_modeling_oneformer.py      | 18 ++++-----
 .../perceiver/test_modeling_perceiver.py      |  9 +++--
 .../segformer/test_modeling_segformer.py      | 10 ++---
 .../segformer/test_modeling_tf_segformer.py   |  6 +--
 .../models/speecht5/test_modeling_speecht5.py | 39 +++++++++----------
 .../swiftformer/test_modeling_swiftformer.py  | 10 ++---
 .../test_modeling_table_transformer.py        | 35 ++++++++++-------
 .../test_modeling_timm_backbone.py            |  4 +-
 tests/models/tvlt/test_modeling_tvlt.py       | 10 ++---
 tests/models/upernet/test_modeling_upernet.py | 10 ++---
 .../models/videomae/test_modeling_videomae.py |  8 ++--
 tests/models/vit_mae/test_modeling_vit_mae.py |  8 ++--
 tests/models/vivit/test_modeling_vivit.py     |  8 +---
 tests/test_modeling_common.py                 |  1 -
 19 files changed, 109 insertions(+), 108 deletions(-)

diff --git a/src/transformers/models/perceiver/configuration_perceiver.py b/src/transformers/models/perceiver/configuration_perceiver.py
index 86f5268fed..182e92b8a3 100644
--- a/src/transformers/models/perceiver/configuration_perceiver.py
+++ b/src/transformers/models/perceiver/configuration_perceiver.py
@@ -97,6 +97,8 @@ class PerceiverConfig(PretrainedConfig):
             Number of audio samples per frame for the multimodal autoencoding model.
         samples_per_patch (`int`, *optional*, defaults to 16):
             Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model.
+        output_num_channels (`int`, *optional*, defaults to 512):
+            Number of output channels for each modality decoder.
         output_shape (`List[int]`, *optional*, defaults to `[1, 16, 224, 224]`):
             Shape of the output (batch_size, num_frames, height, width) for the video decoder queries of the multimodal
             autoencoding model. This excludes the channel dimension.
@@ -144,6 +146,8 @@ class PerceiverConfig(PretrainedConfig):
         audio_samples_per_frame=1920,
         samples_per_patch=16,
         output_shape=[1, 16, 224, 224],
+        output_num_channels=512,
+        _label_trainable_num_channels=1024,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -177,6 +181,8 @@ class PerceiverConfig(PretrainedConfig):
         self.audio_samples_per_frame = audio_samples_per_frame
         self.samples_per_patch = samples_per_patch
         self.output_shape = output_shape
+        self.output_num_channels = output_num_channels
+        self._label_trainable_num_channels = _label_trainable_num_channels
 
 
 class PerceiverOnnxConfig(OnnxConfig):
diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py
index 9d2e2fcf98..bb7ac2bc31 100755
--- a/src/transformers/models/perceiver/modeling_perceiver.py
+++ b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -1830,7 +1830,7 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
             # Autoencoding, don't pass inputs to the queries.
             concat_preprocessed_input=False,
             output_shape=config.output_shape,
-            output_num_channels=512,
+            output_num_channels=config.output_num_channels,
             use_query_residual=False,
             position_encoding_only=True,
             position_encoding_type="fourier",
@@ -1854,7 +1854,7 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
                     # Autoencoding, don't pass inputs to the queries.
                     concat_preprocessed_input=False,
                     output_index_dims=(n_audio_samples // config.samples_per_patch,),
-                    output_num_channels=512,
+                    output_num_channels=config.output_num_channels,
                     use_query_residual=False,
                     position_encoding_only=True,
                     position_encoding_type="fourier",
@@ -1874,21 +1874,21 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
                     position_encoding_only=True,
                     position_encoding_type="trainable",
                     trainable_position_encoding_kwargs={
-                        "num_channels": 1024,
+                        "num_channels": config._label_trainable_num_channels,
                         "index_dims": 1,
                     },
                 ),
             },
             num_outputs=None,
-            output_num_channels=512,
+            output_num_channels=config.output_num_channels,
             use_query_residual=False,
         )
 
         output_postprocessor = PerceiverMultimodalPostprocessor(
             modalities={
-                "audio": PerceiverAudioPostprocessor(config, in_channels=512),
-                "image": PerceiverProjectionPostprocessor(in_channels=512, out_channels=3),
-                "label": PerceiverClassificationPostprocessor(config, in_channels=512),
+                "audio": PerceiverAudioPostprocessor(config, in_channels=config.output_num_channels),
+                "image": PerceiverProjectionPostprocessor(in_channels=config.output_num_channels, out_channels=3),
+                "label": PerceiverClassificationPostprocessor(config, in_channels=config.output_num_channels),
             }
         )
 
diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py
index 94213008c6..250816ef49 100644
--- a/src/transformers/models/table_transformer/configuration_table_transformer.py
+++ b/src/transformers/models/table_transformer/configuration_table_transformer.py
@@ -13,9 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Table Transformer model configuration"""
-
+import copy
 from collections import OrderedDict
-from typing import Mapping
+from typing import Dict, Mapping
 
 from packaging import version
 
@@ -237,6 +237,17 @@ class TableTransformerConfig(PretrainedConfig):
     def hidden_size(self) -> int:
         return self.d_model
 
+    def to_dict(self) -> Dict[str, any]:
+        """
+        Serializes this instance to a Python dictionary, overriding the default [`~PretrainedConfig.to_dict`] so that
+        a nested `backbone_config` is serialized too. Returns a `Dict[str, any]` of all attributes of this instance.
+        """
+        output = copy.deepcopy(self.__dict__)
+        if output["backbone_config"] is not None:
+            output["backbone_config"] = self.backbone_config.to_dict()
+        output["model_type"] = self.__class__.model_type
+        return output
+
 
 # Copied from transformers.models.detr.configuration_detr.DetrOnnxConfig
 class TableTransformerOnnxConfig(OnnxConfig):
diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py
index 687d1ae4a5..0535fbf4e1 100644
--- a/tests/models/layoutlm/test_modeling_layoutlm.py
+++ b/tests/models/layoutlm/test_modeling_layoutlm.py
@@ -279,10 +279,6 @@ class LayoutLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
 
 def prepare_layoutlm_batch_inputs():
     # Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on:
diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
index 4eda8952c3..c8457331c5 100644
--- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -415,7 +415,7 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    @unittest.skip("We cannot configure detectron2 to output a smaller backbone")
     def test_model_is_small(self):
         pass
 
diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py
index f23c0f265c..ef4a45021a 100644
--- a/tests/models/oneformer/test_modeling_oneformer.py
+++ b/tests/models/oneformer/test_modeling_oneformer.py
@@ -112,16 +112,20 @@ class OneFormerModelTester:
         config = OneFormerConfig(
             text_encoder_vocab_size=self.vocab_size,
             hidden_size=self.hidden_dim,
+            num_queries=self.num_queries,
+            num_labels=self.num_labels,
+            encoder_feedforward_dim=32,
+            dim_feedforward=64,
+            encoder_layers=2,
+            decoder_layers=2,
         )
 
-        config.num_queries = self.num_queries
-        config.num_labels = self.num_labels
-
+        config.backbone_config.embed_dim = 16
         config.backbone_config.depths = [1, 1, 1, 1]
+        config.backbone_config.hidden_size = 16
         config.backbone_config.num_channels = self.num_channels
+        config.backbone_config.num_heads = [1, 1, 2, 2]
 
-        config.encoder_feedforward_dim = 64
-        config.dim_feedforward = 128
         config.hidden_dim = self.hidden_dim
         config.mask_dim = self.hidden_dim
         config.conv_dim = self.hidden_dim
@@ -309,10 +313,6 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
             expected_arg_names = ["pixel_values", "task_inputs"]
             self.assertListEqual(arg_names[:2], expected_arg_names)
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
     @slow
     def test_model_from_pretrained(self):
         for model_name in ["shi-labs/oneformer_ade20k_swin_tiny"]:
diff --git a/tests/models/perceiver/test_modeling_perceiver.py b/tests/models/perceiver/test_modeling_perceiver.py
index 23bd75bdd1..91fac90e7b 100644
--- a/tests/models/perceiver/test_modeling_perceiver.py
+++ b/tests/models/perceiver/test_modeling_perceiver.py
@@ -79,6 +79,7 @@ class PerceiverModelTester:
         nchunks=20,
         num_latents=10,
         d_latents=20,
+        d_model=64,
         num_blocks=1,
         num_self_attends_per_block=2,
         num_self_attention_heads=1,
@@ -108,6 +109,7 @@ class PerceiverModelTester:
         self.nchunks = nchunks
         self.num_latents = num_latents
         self.d_latents = d_latents
+        self.d_model = d_model
         self.num_blocks = num_blocks
         self.num_self_attends_per_block = num_self_attends_per_block
         self.num_self_attention_heads = num_self_attention_heads
@@ -181,6 +183,7 @@ class PerceiverModelTester:
         return PerceiverConfig(
             num_latents=self.num_latents,
             d_latents=self.d_latents,
+            d_model=self.d_model,
             qk_channels=self.d_latents,
             v_channels=self.d_latents,
             num_blocks=self.num_blocks,
@@ -200,6 +203,8 @@ class PerceiverModelTester:
             audio_samples_per_frame=self.audio_samples_per_frame,
             samples_per_patch=self.samples_per_patch,
             num_labels=self.num_labels,
+            output_num_channels=32,
+            _label_trainable_num_channels=16,
         )
 
     def get_pipeline_config(self):
@@ -784,10 +789,6 @@ class PerceiverModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
 
                     loss.backward()
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
     @require_torch_multi_gpu
     @unittest.skip(
         reason=(
diff --git a/tests/models/segformer/test_modeling_segformer.py b/tests/models/segformer/test_modeling_segformer.py
index 7f19001148..0506be9b1f 100644
--- a/tests/models/segformer/test_modeling_segformer.py
+++ b/tests/models/segformer/test_modeling_segformer.py
@@ -61,11 +61,11 @@ class SegformerModelTester:
         image_size=64,
         num_channels=3,
         num_encoder_blocks=4,
-        depths=[2, 2, 2, 2],
+        depths=[1, 1, 1, 1],
         sr_ratios=[8, 4, 2, 1],
-        hidden_sizes=[16, 32, 64, 128],
+        hidden_sizes=[8, 8, 16, 16],
         downsampling_rates=[1, 4, 8, 16],
-        num_attention_heads=[1, 2, 4, 8],
+        num_attention_heads=[1, 1, 2, 2],
         is_training=True,
         use_labels=True,
         hidden_act="gelu",
@@ -347,10 +347,6 @@ class SegformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
             loss = model(**inputs).loss
             loss.backward()
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
     @slow
     def test_model_from_pretrained(self):
         for model_name in SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/segformer/test_modeling_tf_segformer.py b/tests/models/segformer/test_modeling_tf_segformer.py
index b4dc657b79..aca621f509 100644
--- a/tests/models/segformer/test_modeling_tf_segformer.py
+++ b/tests/models/segformer/test_modeling_tf_segformer.py
@@ -58,11 +58,11 @@ class TFSegformerModelTester:
         image_size=64,
         num_channels=3,
         num_encoder_blocks=4,
-        depths=[2, 2, 2, 2],
+        depths=[1, 1, 1, 1],
         sr_ratios=[8, 4, 2, 1],
-        hidden_sizes=[16, 32, 64, 128],
+        hidden_sizes=[8, 8, 16, 16],
         downsampling_rates=[1, 4, 8, 16],
-        num_attention_heads=[1, 2, 4, 8],
+        num_attention_heads=[1, 1, 2, 2],
         is_training=True,
         use_labels=True,
         hidden_act="gelu",
diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py
index fbd09ef500..c357259d78 100644
--- a/tests/models/speecht5/test_modeling_speecht5.py
+++ b/tests/models/speecht5/test_modeling_speecht5.py
@@ -238,10 +238,6 @@ class SpeechT5ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         # disabled because this model doesn't have decoder_input_ids
         pass
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
 
 @require_torch
 class SpeechT5ForSpeechToTextTester:
@@ -705,10 +701,6 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase):
     def test_training_gradient_checkpointing(self):
         pass
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
@@ -800,6 +792,9 @@ class SpeechT5ForTextToSpeechTester:
         vocab_size=81,
         num_mel_bins=20,
         reduction_factor=2,
+        speech_decoder_postnet_layers=2,
+        speech_decoder_postnet_units=32,
+        speech_decoder_prenet_units=32,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -813,6 +808,9 @@ class SpeechT5ForTextToSpeechTester:
         self.vocab_size = vocab_size
         self.num_mel_bins = num_mel_bins
         self.reduction_factor = reduction_factor
+        self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
+        self.speech_decoder_postnet_units = speech_decoder_postnet_units
+        self.speech_decoder_prenet_units = speech_decoder_prenet_units
 
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size).clamp(2)
@@ -847,6 +845,9 @@ class SpeechT5ForTextToSpeechTester:
             vocab_size=self.vocab_size,
             num_mel_bins=self.num_mel_bins,
             reduction_factor=self.reduction_factor,
+            speech_decoder_postnet_layers=self.speech_decoder_postnet_layers,
+            speech_decoder_postnet_units=self.speech_decoder_postnet_units,
+            speech_decoder_prenet_units=self.speech_decoder_prenet_units,
         )
 
     def create_and_check_model_forward(self, config, inputs_dict):
@@ -996,10 +997,6 @@ class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase):
         if hasattr(module, "bias") and module.bias is not None:
             module.bias.data.fill_(3)
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
 
 @require_torch
 @require_sentencepiece
@@ -1046,6 +1043,9 @@ class SpeechT5ForSpeechToSpeechTester:
         vocab_size=81,
         num_mel_bins=20,
         reduction_factor=2,
+        speech_decoder_postnet_layers=2,
+        speech_decoder_postnet_units=32,
+        speech_decoder_prenet_units=32,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -1065,6 +1065,9 @@ class SpeechT5ForSpeechToSpeechTester:
         self.vocab_size = vocab_size
         self.num_mel_bins = num_mel_bins
         self.reduction_factor = reduction_factor
+        self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
+        self.speech_decoder_postnet_units = speech_decoder_postnet_units
+        self.speech_decoder_prenet_units = speech_decoder_prenet_units
 
     def prepare_config_and_inputs(self):
         input_values = floats_tensor([self.batch_size, self.encoder_seq_length], scale=1.0)
@@ -1105,6 +1108,9 @@ class SpeechT5ForSpeechToSpeechTester:
             vocab_size=self.vocab_size,
             num_mel_bins=self.num_mel_bins,
             reduction_factor=self.reduction_factor,
+            speech_decoder_postnet_layers=self.speech_decoder_postnet_layers,
+            speech_decoder_postnet_units=self.speech_decoder_postnet_units,
+            speech_decoder_prenet_units=self.speech_decoder_prenet_units,
         )
 
     def create_and_check_model_forward(self, config, inputs_dict):
@@ -1416,10 +1422,6 @@ class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase):
         if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
             module.masked_spec_embed.data.fill_(3)
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
 
 @require_torch
 @require_sentencepiece
@@ -1478,6 +1480,7 @@ class SpeechT5HifiGanTester:
     def get_config(self):
         return SpeechT5HifiGanConfig(
             model_in_dim=self.num_mel_bins,
+            upsample_initial_channel=32,
         )
 
     def create_and_check_model(self, config, input_values):
@@ -1562,10 +1565,6 @@ class SpeechT5HifiGanTest(ModelTesterMixin, unittest.TestCase):
     def test_retain_grad_hidden_states_attentions(self):
         pass
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
     # skip because it fails on automapping of SpeechT5HifiGanConfig
     def test_save_load_fast_init_from_base(self):
         pass
diff --git a/tests/models/swiftformer/test_modeling_swiftformer.py b/tests/models/swiftformer/test_modeling_swiftformer.py
index 151807c80c..3e286cc320 100644
--- a/tests/models/swiftformer/test_modeling_swiftformer.py
+++ b/tests/models/swiftformer/test_modeling_swiftformer.py
@@ -58,9 +58,9 @@ class SwiftFormerModelTester:
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,
         image_size=224,
-        num_labels=1000,
-        layer_depths=[3, 3, 6, 4],
-        embed_dims=[48, 56, 112, 220],
+        num_labels=3,
+        layer_depths=[1, 1, 1, 1],
+        embed_dims=[16, 16, 32, 32],
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -272,10 +272,6 @@ class SwiftFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
                         msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                     )
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
 
 # We will verify our results on an image of cute cats
 def prepare_img():
diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py
index 0df8da45cb..d81c52ff13 100644
--- a/tests/models/table_transformer/test_modeling_table_transformer.py
+++ b/tests/models/table_transformer/test_modeling_table_transformer.py
@@ -21,8 +21,8 @@ import unittest
 
 from huggingface_hub import hf_hub_download
 
-from transformers import TableTransformerConfig, is_timm_available, is_vision_available
-from transformers.testing_utils import require_timm, require_vision, slow, torch_device
+from transformers import ResNetConfig, TableTransformerConfig, is_torch_available, is_vision_available
+from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -30,10 +30,10 @@ from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
-if is_timm_available():
+if is_torch_available():
     import torch
 
-    from transformers import ResNetConfig, TableTransformerForObjectDetection, TableTransformerModel
+    from transformers import TableTransformerForObjectDetection, TableTransformerModel
 
 
 if is_vision_available():
@@ -49,7 +49,7 @@ class TableTransformerModelTester:
         batch_size=8,
         is_training=True,
         use_labels=True,
-        hidden_size=256,
+        hidden_size=32,
         num_hidden_layers=2,
         num_attention_heads=8,
         intermediate_size=4,
@@ -61,7 +61,7 @@ class TableTransformerModelTester:
         min_size=200,
         max_size=200,
         n_targets=8,
-        num_labels=91,
+        num_labels=3,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -107,6 +107,16 @@ class TableTransformerModelTester:
         return config, pixel_values, pixel_mask, labels
 
     def get_config(self):
+        resnet_config = ResNetConfig(
+            num_channels=3,
+            embeddings_size=10,
+            hidden_sizes=[10, 20, 30, 40],
+            depths=[1, 1, 2, 1],
+            hidden_act="relu",
+            num_labels=3,
+            out_features=["stage2", "stage3", "stage4"],
+            out_indices=[2, 3, 4],
+        )
         return TableTransformerConfig(
             d_model=self.hidden_size,
             encoder_layers=self.num_hidden_layers,
@@ -119,6 +129,8 @@ class TableTransformerModelTester:
             attention_dropout=self.attention_probs_dropout_prob,
             num_queries=self.num_queries,
             num_labels=self.num_labels,
+            use_timm_backbone=False,
+            backbone_config=resnet_config,
         )
 
     def prepare_config_and_inputs_for_common(self):
@@ -175,19 +187,19 @@ class TableTransformerModelTester:
         self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
 
 
-@require_timm
+@require_torch
 class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (
         (
             TableTransformerModel,
             TableTransformerForObjectDetection,
         )
-        if is_timm_available()
+        if is_torch_available()
         else ()
     )
     pipeline_model_mapping = (
         {"feature-extraction": TableTransformerModel, "object-detection": TableTransformerForObjectDetection}
-        if is_timm_available()
+        if is_torch_available()
         else {}
     )
     is_encoder_decoder = True
@@ -453,6 +465,7 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin
 
         # let's set num_channels to 1
         config.num_channels = 1
+        config.backbone_config.num_channels = 1
 
         for model_class in self.all_model_classes:
             model = model_class(config)
@@ -486,10 +499,6 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin
                             msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                         )
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
 
 TOLERANCE = 1e-4
 
diff --git a/tests/models/timm_backbone/test_modeling_timm_backbone.py b/tests/models/timm_backbone/test_modeling_timm_backbone.py
index c134a588b6..eaa022422c 100644
--- a/tests/models/timm_backbone/test_modeling_timm_backbone.py
+++ b/tests/models/timm_backbone/test_modeling_timm_backbone.py
@@ -42,7 +42,7 @@ class TimmBackboneModelTester:
         out_indices=None,
         out_features=None,
         stage_names=None,
-        backbone="resnet50",
+        backbone="resnet18",
         batch_size=3,
         image_size=32,
         num_channels=3,
@@ -196,7 +196,7 @@ class TimmBackboneModelTest(ModelTesterMixin, BackboneTesterMixin, PipelineTeste
     def test_can_use_safetensors(self):
         pass
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    @unittest.skip("Need to use a timm backbone and there is no tiny model available.")
     def test_model_is_small(self):
         pass
 
diff --git a/tests/models/tvlt/test_modeling_tvlt.py b/tests/models/tvlt/test_modeling_tvlt.py
index 41eefc9eb7..e437b2651e 100644
--- a/tests/models/tvlt/test_modeling_tvlt.py
+++ b/tests/models/tvlt/test_modeling_tvlt.py
@@ -67,8 +67,8 @@ class TvltModelTester:
         num_image_channels=3,
         num_audio_channels=1,
         num_frames=2,
-        hidden_size=128,
-        num_hidden_layers=12,
+        hidden_size=32,
+        num_hidden_layers=3,
         num_attention_heads=4,
         intermediate_size=128,
         hidden_act="gelu",
@@ -79,7 +79,7 @@ class TvltModelTester:
         qkv_bias=True,
         use_mean_pooling=True,
         decoder_num_attention_heads=4,
-        decoder_hidden_size=64,
+        decoder_hidden_size=32,
         decoder_num_hidden_layers=2,
         decoder_intermediate_size=128,
         image_mask_ratio=0.75,
@@ -542,10 +542,6 @@ class TvltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
 
 # We will verify our results on a video of eating spaghetti
 # Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
diff --git a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py
index 97ba37f8be..84c32f7233 100644
--- a/tests/models/upernet/test_modeling_upernet.py
+++ b/tests/models/upernet/test_modeling_upernet.py
@@ -51,7 +51,7 @@ class UperNetModelTester:
         num_channels=3,
         num_stages=4,
         hidden_sizes=[10, 20, 30, 40],
-        depths=[2, 2, 3, 2],
+        depths=[1, 1, 1, 1],
         is_training=True,
         use_labels=True,
         intermediate_size=37,
@@ -106,12 +106,12 @@ class UperNetModelTester:
     def get_config(self):
         return UperNetConfig(
             backbone_config=self.get_backbone_config(),
-            hidden_size=512,
+            hidden_size=64,
             pool_scales=[1, 2, 3, 6],
             use_auxiliary_head=True,
             auxiliary_loss_weight=0.4,
             auxiliary_in_channels=40,
-            auxiliary_channels=256,
+            auxiliary_channels=32,
             auxiliary_num_convs=1,
             auxiliary_concat_input=False,
             loss_ignore_index=255,
@@ -207,10 +207,6 @@ class UperNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py
index 6f2e7fa31e..85a0d2714e 100644
--- a/tests/models/videomae/test_modeling_videomae.py
+++ b/tests/models/videomae/test_modeling_videomae.py
@@ -130,6 +130,10 @@ class VideoMAEModelTester:
             attention_probs_dropout_prob=self.attention_probs_dropout_prob,
             is_decoder=False,
             initializer_range=self.initializer_range,
+            decoder_hidden_size=self.hidden_size,
+            decoder_intermediate_size=self.intermediate_size,
+            decoder_num_attention_heads=self.num_attention_heads,
+            decoder_num_hidden_layers=self.num_hidden_layers,
         )
 
     def create_and_check_model(self, config, pixel_values, labels):
@@ -344,10 +348,6 @@ class VideoMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
 
 # We will verify our results on a video of eating spaghetti
 # Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py
index bb50cb9606..3cedb0c176 100644
--- a/tests/models/vit_mae/test_modeling_vit_mae.py
+++ b/tests/models/vit_mae/test_modeling_vit_mae.py
@@ -118,6 +118,10 @@ class ViTMAEModelTester:
             is_decoder=False,
             initializer_range=self.initializer_range,
             mask_ratio=self.mask_ratio,
+            decoder_hidden_size=self.hidden_size,
+            decoder_intermediate_size=self.intermediate_size,
+            decoder_num_attention_heads=self.num_attention_heads,
+            decoder_num_hidden_layers=self.num_hidden_layers,
         )
 
     def create_and_check_model(self, config, pixel_values, labels):
@@ -279,10 +283,6 @@ class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     def test_model_outputs_equivalence(self):
         pass
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
     @slow
     def test_model_from_pretrained(self):
         for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/vivit/test_modeling_vivit.py b/tests/models/vivit/test_modeling_vivit.py
index d7d72eca7e..43db8bad7b 100644
--- a/tests/models/vivit/test_modeling_vivit.py
+++ b/tests/models/vivit/test_modeling_vivit.py
@@ -55,8 +55,8 @@ class VivitModelTester:
         num_frames=8,  # decreased, because default 32 takes too much RAM at inference
         tubelet_size=[2, 4, 4],
         num_channels=3,
-        hidden_size=768,
-        num_hidden_layers=5,
+        hidden_size=32,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu_fast",
@@ -310,10 +310,6 @@ class VivitModelTest(ModelTesterMixin, unittest.TestCase):
 
             check_hidden_states_output(inputs_dict, config, model_class)
 
-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
 
 # We will verify our results on a video of eating spaghetti
 # Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 0d5080ec5a..87960983f9 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -2708,7 +2708,6 @@ class ModelTesterMixin:
     def test_model_is_small(self):
         # Just a consistency check to make sure we are not running tests on 80M parameter models.
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        # print(config)
 
         for model_class in self.all_model_classes:
             model = model_class(config)