Fix last models for common tests that are too big. (#25058)

* Fix last models for common tests that are too big.
* Remove print statement
2023-07-25 07:56:04 -04:00
parent ee1eb3b325
commit f295fc8a16
19 changed files with 109 additions and 108 deletions
--- a/src/transformers/models/perceiver/configuration_perceiver.py
+++ b/src/transformers/models/perceiver/configuration_perceiver.py
@@ -97,6 +97,8 @@ class PerceiverConfig(PretrainedConfig):
            Number of audio samples per frame for the multimodal autoencoding model.
        samples_per_patch (`int`, *optional*, defaults to 16):
            Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model.
+        output_num_channels (`int`, *optional*, defaults to 512):
+            Number of output channels for each modality decoder.
        output_shape (`List[int]`, *optional*, defaults to `[1, 16, 224, 224]`):
            Shape of the output (batch_size, num_frames, height, width) for the video decoder queries of the multimodal
            autoencoding model. This excludes the channel dimension.
@@ -144,6 +146,8 @@ class PerceiverConfig(PretrainedConfig):
        audio_samples_per_frame=1920,
        samples_per_patch=16,
        output_shape=[1, 16, 224, 224],
+        output_num_channels=512,
+        _label_trainable_num_channels=1024,
        **kwargs,
    ):
        super().__init__(**kwargs)
@@ -177,6 +181,8 @@ class PerceiverConfig(PretrainedConfig):
        self.audio_samples_per_frame = audio_samples_per_frame
        self.samples_per_patch = samples_per_patch
        self.output_shape = output_shape
+        self.output_num_channels = output_num_channels
+        self._label_trainable_num_channels = _label_trainable_num_channels


 class PerceiverOnnxConfig(OnnxConfig):
--- a/src/transformers/models/perceiver/modeling_perceiver.py
+++ b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -1830,7 +1830,7 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
            # Autoencoding, don't pass inputs to the queries.
            concat_preprocessed_input=False,
            output_shape=config.output_shape,
-            output_num_channels=512,
+            output_num_channels=config.output_num_channels,
            use_query_residual=False,
            position_encoding_only=True,
            position_encoding_type="fourier",
@@ -1854,7 +1854,7 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
                    # Autoencoding, don't pass inputs to the queries.
                    concat_preprocessed_input=False,
                    output_index_dims=(n_audio_samples // config.samples_per_patch,),
-                    output_num_channels=512,
+                    output_num_channels=config.output_num_channels,
                    use_query_residual=False,
                    position_encoding_only=True,
                    position_encoding_type="fourier",
@@ -1874,21 +1874,21 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
                    position_encoding_only=True,
                    position_encoding_type="trainable",
                    trainable_position_encoding_kwargs={
-                        "num_channels": 1024,
+                        "num_channels": config._label_trainable_num_channels,
                        "index_dims": 1,
                    },
                ),
            },
            num_outputs=None,
-            output_num_channels=512,
+            output_num_channels=config.output_num_channels,
            use_query_residual=False,
        )

        output_postprocessor = PerceiverMultimodalPostprocessor(
            modalities={
-                "audio": PerceiverAudioPostprocessor(config, in_channels=512),
-                "image": PerceiverProjectionPostprocessor(in_channels=512, out_channels=3),
-                "label": PerceiverClassificationPostprocessor(config, in_channels=512),
+                "audio": PerceiverAudioPostprocessor(config, in_channels=config.output_num_channels),
+                "image": PerceiverProjectionPostprocessor(in_channels=config.output_num_channels, out_channels=3),
+                "label": PerceiverClassificationPostprocessor(config, in_channels=config.output_num_channels),
            }
        )

--- a/src/transformers/models/table_transformer/configuration_table_transformer.py
+++ b/src/transformers/models/table_transformer/configuration_table_transformer.py
@@ -13,9 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Table Transformer model configuration"""
-
+import copy
 from collections import OrderedDict
-from typing import Mapping
+from typing import Dict, Mapping

 from packaging import version

@@ -237,6 +237,17 @@ class TableTransformerConfig(PretrainedConfig):
    def hidden_size(self) -> int:
        return self.d_model

+    def to_dict(self) -> Dict[str, any]:
+        """
+        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`].
+        Returns a `Dict[str, any]` of all the attributes that make up this configuration instance.
+        """
+        output = copy.deepcopy(self.__dict__)
+        if output["backbone_config"] is not None:
+            output["backbone_config"] = self.backbone_config.to_dict()
+        output["model_type"] = self.__class__.model_type
+        return output
+

 # Copied from transformers.models.detr.configuration_detr.DetrOnnxConfig
 class TableTransformerOnnxConfig(OnnxConfig):
--- a/tests/models/layoutlm/test_modeling_layoutlm.py
+++ b/tests/models/layoutlm/test_modeling_layoutlm.py
@@ -279,10 +279,6 @@ class LayoutLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-

 def prepare_layoutlm_batch_inputs():
    # Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on:
--- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -415,7 +415,7 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa

            check_hidden_states_output(inputs_dict, config, model_class)

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    @unittest.skip("We cannot configure detectron2 to output a smaller backbone")
    def test_model_is_small(self):
        pass

--- a/tests/models/oneformer/test_modeling_oneformer.py
+++ b/tests/models/oneformer/test_modeling_oneformer.py
@@ -112,16 +112,20 @@ class OneFormerModelTester:
        config = OneFormerConfig(
            text_encoder_vocab_size=self.vocab_size,
            hidden_size=self.hidden_dim,
+            num_queries=self.num_queries,
+            num_labels=self.num_labels,
+            encoder_feedforward_dim=32,
+            dim_feedforward=64,
+            encoder_layers=2,
+            decoder_layers=2,
        )

-        config.num_queries = self.num_queries
-        config.num_labels = self.num_labels
-
+        config.backbone_config.embed_dim = 16
        config.backbone_config.depths = [1, 1, 1, 1]
+        config.backbone_config.hidden_size = 16
        config.backbone_config.num_channels = self.num_channels
+        config.backbone_config.num_heads = [1, 1, 2, 2]

-        config.encoder_feedforward_dim = 64
-        config.dim_feedforward = 128
        config.hidden_dim = self.hidden_dim
        config.mask_dim = self.hidden_dim
        config.conv_dim = self.hidden_dim
@@ -309,10 +313,6 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
            expected_arg_names = ["pixel_values", "task_inputs"]
            self.assertListEqual(arg_names[:2], expected_arg_names)

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
    @slow
    def test_model_from_pretrained(self):
        for model_name in ["shi-labs/oneformer_ade20k_swin_tiny"]:
--- a/tests/models/perceiver/test_modeling_perceiver.py
+++ b/tests/models/perceiver/test_modeling_perceiver.py
@@ -79,6 +79,7 @@ class PerceiverModelTester:
        nchunks=20,
        num_latents=10,
        d_latents=20,
+        d_model=64,
        num_blocks=1,
        num_self_attends_per_block=2,
        num_self_attention_heads=1,
@@ -108,6 +109,7 @@ class PerceiverModelTester:
        self.nchunks = nchunks
        self.num_latents = num_latents
        self.d_latents = d_latents
+        self.d_model = d_model
        self.num_blocks = num_blocks
        self.num_self_attends_per_block = num_self_attends_per_block
        self.num_self_attention_heads = num_self_attention_heads
@@ -181,6 +183,7 @@ class PerceiverModelTester:
        return PerceiverConfig(
            num_latents=self.num_latents,
            d_latents=self.d_latents,
+            d_model=self.d_model,
            qk_channels=self.d_latents,
            v_channels=self.d_latents,
            num_blocks=self.num_blocks,
@@ -200,6 +203,8 @@ class PerceiverModelTester:
            audio_samples_per_frame=self.audio_samples_per_frame,
            samples_per_patch=self.samples_per_patch,
            num_labels=self.num_labels,
+            output_num_channels=32,
+            _label_trainable_num_channels=16,
        )

    def get_pipeline_config(self):
@@ -784,10 +789,6 @@ class PerceiverModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas

                    loss.backward()

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
    @require_torch_multi_gpu
    @unittest.skip(
        reason=(
--- a/tests/models/segformer/test_modeling_segformer.py
+++ b/tests/models/segformer/test_modeling_segformer.py
@@ -61,11 +61,11 @@ class SegformerModelTester:
        image_size=64,
        num_channels=3,
        num_encoder_blocks=4,
-        depths=[2, 2, 2, 2],
+        depths=[1, 1, 1, 1],
        sr_ratios=[8, 4, 2, 1],
-        hidden_sizes=[16, 32, 64, 128],
+        hidden_sizes=[8, 8, 16, 16],
        downsampling_rates=[1, 4, 8, 16],
-        num_attention_heads=[1, 2, 4, 8],
+        num_attention_heads=[1, 1, 2, 2],
        is_training=True,
        use_labels=True,
        hidden_act="gelu",
@@ -347,10 +347,6 @@ class SegformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
            loss = model(**inputs).loss
            loss.backward()

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
    @slow
    def test_model_from_pretrained(self):
        for model_name in SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
--- a/tests/models/segformer/test_modeling_tf_segformer.py
+++ b/tests/models/segformer/test_modeling_tf_segformer.py
@@ -58,11 +58,11 @@ class TFSegformerModelTester:
        image_size=64,
        num_channels=3,
        num_encoder_blocks=4,
-        depths=[2, 2, 2, 2],
+        depths=[1, 1, 1, 1],
        sr_ratios=[8, 4, 2, 1],
-        hidden_sizes=[16, 32, 64, 128],
+        hidden_sizes=[8, 8, 16, 16],
        downsampling_rates=[1, 4, 8, 16],
-        num_attention_heads=[1, 2, 4, 8],
+        num_attention_heads=[1, 1, 2, 2],
        is_training=True,
        use_labels=True,
        hidden_act="gelu",
--- a/tests/models/speecht5/test_modeling_speecht5.py
+++ b/tests/models/speecht5/test_modeling_speecht5.py
@@ -238,10 +238,6 @@ class SpeechT5ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
        # disabled because this model doesn't have decoder_input_ids
        pass

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-

@require_torch
 class SpeechT5ForSpeechToTextTester:
@@ -705,10 +701,6 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase):
    def test_training_gradient_checkpointing(self):
        pass

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
    # overwrite from test_modeling_common
    def _mock_init_weights(self, module):
        if hasattr(module, "weight") and module.weight is not None:
@@ -800,6 +792,9 @@ class SpeechT5ForTextToSpeechTester:
        vocab_size=81,
        num_mel_bins=20,
        reduction_factor=2,
+        speech_decoder_postnet_layers=2,
+        speech_decoder_postnet_units=32,
+        speech_decoder_prenet_units=32,
    ):
        self.parent = parent
        self.batch_size = batch_size
@@ -813,6 +808,9 @@ class SpeechT5ForTextToSpeechTester:
        self.vocab_size = vocab_size
        self.num_mel_bins = num_mel_bins
        self.reduction_factor = reduction_factor
+        self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
+        self.speech_decoder_postnet_units = speech_decoder_postnet_units
+        self.speech_decoder_prenet_units = speech_decoder_prenet_units

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size).clamp(2)
@@ -847,6 +845,9 @@ class SpeechT5ForTextToSpeechTester:
            vocab_size=self.vocab_size,
            num_mel_bins=self.num_mel_bins,
            reduction_factor=self.reduction_factor,
+            speech_decoder_postnet_layers=self.speech_decoder_postnet_layers,
+            speech_decoder_postnet_units=self.speech_decoder_postnet_units,
+            speech_decoder_prenet_units=self.speech_decoder_prenet_units,
        )

    def create_and_check_model_forward(self, config, inputs_dict):
@@ -996,10 +997,6 @@ class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase):
        if hasattr(module, "bias") and module.bias is not None:
            module.bias.data.fill_(3)

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-

@require_torch
@require_sentencepiece
@@ -1046,6 +1043,9 @@ class SpeechT5ForSpeechToSpeechTester:
        vocab_size=81,
        num_mel_bins=20,
        reduction_factor=2,
+        speech_decoder_postnet_layers=2,
+        speech_decoder_postnet_units=32,
+        speech_decoder_prenet_units=32,
    ):
        self.parent = parent
        self.batch_size = batch_size
@@ -1065,6 +1065,9 @@ class SpeechT5ForSpeechToSpeechTester:
        self.vocab_size = vocab_size
        self.num_mel_bins = num_mel_bins
        self.reduction_factor = reduction_factor
+        self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
+        self.speech_decoder_postnet_units = speech_decoder_postnet_units
+        self.speech_decoder_prenet_units = speech_decoder_prenet_units

    def prepare_config_and_inputs(self):
        input_values = floats_tensor([self.batch_size, self.encoder_seq_length], scale=1.0)
@@ -1105,6 +1108,9 @@ class SpeechT5ForSpeechToSpeechTester:
            vocab_size=self.vocab_size,
            num_mel_bins=self.num_mel_bins,
            reduction_factor=self.reduction_factor,
+            speech_decoder_postnet_layers=self.speech_decoder_postnet_layers,
+            speech_decoder_postnet_units=self.speech_decoder_postnet_units,
+            speech_decoder_prenet_units=self.speech_decoder_prenet_units,
        )

    def create_and_check_model_forward(self, config, inputs_dict):
@@ -1416,10 +1422,6 @@ class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase):
        if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
            module.masked_spec_embed.data.fill_(3)

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-

@require_torch
@require_sentencepiece
@@ -1478,6 +1480,7 @@ class SpeechT5HifiGanTester:
    def get_config(self):
        return SpeechT5HifiGanConfig(
            model_in_dim=self.num_mel_bins,
+            upsample_initial_channel=32,
        )

    def create_and_check_model(self, config, input_values):
@@ -1562,10 +1565,6 @@ class SpeechT5HifiGanTest(ModelTesterMixin, unittest.TestCase):
    def test_retain_grad_hidden_states_attentions(self):
        pass

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
    # skip because it fails on automapping of SpeechT5HifiGanConfig
    def test_save_load_fast_init_from_base(self):
        pass
--- a/tests/models/swiftformer/test_modeling_swiftformer.py
+++ b/tests/models/swiftformer/test_modeling_swiftformer.py
@@ -58,9 +58,9 @@ class SwiftFormerModelTester:
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        image_size=224,
-        num_labels=1000,
-        layer_depths=[3, 3, 6, 4],
-        embed_dims=[48, 56, 112, 220],
+        num_labels=3,
+        layer_depths=[1, 1, 1, 1],
+        embed_dims=[16, 16, 32, 32],
    ):
        self.parent = parent
        self.batch_size = batch_size
@@ -272,10 +272,6 @@ class SwiftFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                    )

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-

 # We will verify our results on an image of cute cats
 def prepare_img():
--- a/tests/models/table_transformer/test_modeling_table_transformer.py
+++ b/tests/models/table_transformer/test_modeling_table_transformer.py
@@ -21,8 +21,8 @@ import unittest

 from huggingface_hub import hf_hub_download

-from transformers import TableTransformerConfig, is_timm_available, is_vision_available
-from transformers.testing_utils import require_timm, require_vision, slow, torch_device
+from transformers import ResNetConfig, TableTransformerConfig, is_torch_available, is_vision_available
+from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device

 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -30,10 +30,10 @@ from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_
 from ...test_pipeline_mixin import PipelineTesterMixin


-if is_timm_available():
+if is_torch_available():
    import torch

-    from transformers import ResNetConfig, TableTransformerForObjectDetection, TableTransformerModel
+    from transformers import TableTransformerForObjectDetection, TableTransformerModel


 if is_vision_available():
@@ -49,7 +49,7 @@ class TableTransformerModelTester:
        batch_size=8,
        is_training=True,
        use_labels=True,
-        hidden_size=256,
+        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=8,
        intermediate_size=4,
@@ -61,7 +61,7 @@ class TableTransformerModelTester:
        min_size=200,
        max_size=200,
        n_targets=8,
-        num_labels=91,
+        num_labels=3,
    ):
        self.parent = parent
        self.batch_size = batch_size
@@ -107,6 +107,16 @@ class TableTransformerModelTester:
        return config, pixel_values, pixel_mask, labels

    def get_config(self):
+        resnet_config = ResNetConfig(
+            num_channels=3,
+            embeddings_size=10,
+            hidden_sizes=[10, 20, 30, 40],
+            depths=[1, 1, 2, 1],
+            hidden_act="relu",
+            num_labels=3,
+            out_features=["stage2", "stage3", "stage4"],
+            out_indices=[2, 3, 4],
+        )
        return TableTransformerConfig(
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
@@ -119,6 +129,8 @@ class TableTransformerModelTester:
            attention_dropout=self.attention_probs_dropout_prob,
            num_queries=self.num_queries,
            num_labels=self.num_labels,
+            use_timm_backbone=False,
+            backbone_config=resnet_config,
        )

    def prepare_config_and_inputs_for_common(self):
@@ -175,19 +187,19 @@ class TableTransformerModelTester:
        self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))


-@require_timm
+@require_torch
 class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
    all_model_classes = (
        (
            TableTransformerModel,
            TableTransformerForObjectDetection,
        )
-        if is_timm_available()
+        if is_torch_available()
        else ()
    )
    pipeline_model_mapping = (
        {"feature-extraction": TableTransformerModel, "object-detection": TableTransformerForObjectDetection}
-        if is_timm_available()
+        if is_torch_available()
        else {}
    )
    is_encoder_decoder = True
@@ -453,6 +465,7 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin

        # let's set num_channels to 1
        config.num_channels = 1
+        config.backbone_config.num_channels = 1

        for model_class in self.all_model_classes:
            model = model_class(config)
@@ -486,10 +499,6 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin
                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                        )

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-

 TOLERANCE = 1e-4

--- a/tests/models/timm_backbone/test_modeling_timm_backbone.py
+++ b/tests/models/timm_backbone/test_modeling_timm_backbone.py
@@ -42,7 +42,7 @@ class TimmBackboneModelTester:
        out_indices=None,
        out_features=None,
        stage_names=None,
-        backbone="resnet50",
+        backbone="resnet18",
        batch_size=3,
        image_size=32,
        num_channels=3,
@@ -196,7 +196,7 @@ class TimmBackboneModelTest(ModelTesterMixin, BackboneTesterMixin, PipelineTeste
    def test_can_use_safetensors(self):
        pass

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
+    @unittest.skip("Need to use a timm backbone and there is no tiny model available.")
    def test_model_is_small(self):
        pass

--- a/tests/models/tvlt/test_modeling_tvlt.py
+++ b/tests/models/tvlt/test_modeling_tvlt.py
@@ -67,8 +67,8 @@ class TvltModelTester:
        num_image_channels=3,
        num_audio_channels=1,
        num_frames=2,
-        hidden_size=128,
-        num_hidden_layers=12,
+        hidden_size=32,
+        num_hidden_layers=3,
        num_attention_heads=4,
        intermediate_size=128,
        hidden_act="gelu",
@@ -79,7 +79,7 @@ class TvltModelTester:
        qkv_bias=True,
        use_mean_pooling=True,
        decoder_num_attention_heads=4,
-        decoder_hidden_size=64,
+        decoder_hidden_size=32,
        decoder_num_hidden_layers=2,
        decoder_intermediate_size=128,
        image_mask_ratio=0.75,
@@ -542,10 +542,6 @@ class TvltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):

            check_hidden_states_output(inputs_dict, config, model_class)

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-

 # We will verify our results on a video of eating spaghetti
 # Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
--- a/tests/models/upernet/test_modeling_upernet.py
+++ b/tests/models/upernet/test_modeling_upernet.py
@@ -51,7 +51,7 @@ class UperNetModelTester:
        num_channels=3,
        num_stages=4,
        hidden_sizes=[10, 20, 30, 40],
-        depths=[2, 2, 3, 2],
+        depths=[1, 1, 1, 1],
        is_training=True,
        use_labels=True,
        intermediate_size=37,
@@ -106,12 +106,12 @@ class UperNetModelTester:
    def get_config(self):
        return UperNetConfig(
            backbone_config=self.get_backbone_config(),
-            hidden_size=512,
+            hidden_size=64,
            pool_scales=[1, 2, 3, 6],
            use_auxiliary_head=True,
            auxiliary_loss_weight=0.4,
            auxiliary_in_channels=40,
-            auxiliary_channels=256,
+            auxiliary_channels=32,
            auxiliary_num_convs=1,
            auxiliary_concat_input=False,
            loss_ignore_index=255,
@@ -207,10 +207,6 @@ class UperNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
    def test_multi_gpu_data_parallel_forward(self):
        pass

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
--- a/tests/models/videomae/test_modeling_videomae.py
+++ b/tests/models/videomae/test_modeling_videomae.py
@@ -130,6 +130,10 @@ class VideoMAEModelTester:
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            is_decoder=False,
            initializer_range=self.initializer_range,
+            decoder_hidden_size=self.hidden_size,
+            decoder_intermediate_size=self.intermediate_size,
+            decoder_num_attention_heads=self.num_attention_heads,
+            decoder_num_hidden_layers=self.num_hidden_layers,
        )

    def create_and_check_model(self, config, pixel_values, labels):
@@ -344,10 +348,6 @@ class VideoMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase

            check_hidden_states_output(inputs_dict, config, model_class)

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-

 # We will verify our results on a video of eating spaghetti
 # Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
--- a/tests/models/vit_mae/test_modeling_vit_mae.py
+++ b/tests/models/vit_mae/test_modeling_vit_mae.py
@@ -118,6 +118,10 @@ class ViTMAEModelTester:
            is_decoder=False,
            initializer_range=self.initializer_range,
            mask_ratio=self.mask_ratio,
+            decoder_hidden_size=self.hidden_size,
+            decoder_intermediate_size=self.intermediate_size,
+            decoder_num_attention_heads=self.num_attention_heads,
+            decoder_num_hidden_layers=self.num_hidden_layers,
        )

    def create_and_check_model(self, config, pixel_values, labels):
@@ -279,10 +283,6 @@ class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    def test_model_outputs_equivalence(self):
        pass

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-
    @slow
    def test_model_from_pretrained(self):
        for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
--- a/tests/models/vivit/test_modeling_vivit.py
+++ b/tests/models/vivit/test_modeling_vivit.py
@@ -55,8 +55,8 @@ class VivitModelTester:
        num_frames=8,  # decreased, because default 32 takes too much RAM at inference
        tubelet_size=[2, 4, 4],
        num_channels=3,
-        hidden_size=768,
-        num_hidden_layers=5,
+        hidden_size=32,
+        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu_fast",
@@ -310,10 +310,6 @@ class VivitModelTest(ModelTesterMixin, unittest.TestCase):

            check_hidden_states_output(inputs_dict, config, model_class)

-    @unittest.skip("Will be fixed soon by reducing the size of the model used for common tests.")
-    def test_model_is_small(self):
-        pass
-

 # We will verify our results on a video of eating spaghetti
 # Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -2708,7 +2708,6 @@ class ModelTesterMixin:
    def test_model_is_small(self):
        # Just a consistency check to make sure we are not running tests on 80M parameter models.
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        # print(config)

        for model_class in self.all_model_classes:
            model = model_class(config)