From bd90cda9a6bb4723515c17df1192e53abc8e36e3 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Wed, 2 Aug 2023 20:22:36 +0200
Subject: [PATCH] =?UTF-8?q?CI=20with=20`num=5Fhidden=5Flayers=3D2`=20?=
 =?UTF-8?q?=F0=9F=9A=80=F0=9F=9A=80=F0=9F=9A=80=20(#25266)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* CI with layers=2

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 tests/models/albert/test_modeling_albert.py   |  5 ++--
 .../albert/test_modeling_flax_albert.py       |  2 +-
 tests/models/align/test_modeling_align.py     |  2 +-
 tests/models/altclip/test_modeling_altclip.py |  4 +--
 ..._modeling_audio_spectrogram_transformer.py |  2 +-
 tests/models/bart/test_modeling_bart.py       |  2 +-
 tests/models/beit/test_modeling_beit.py       |  2 +-
 tests/models/beit/test_modeling_flax_beit.py  |  2 +-
 tests/models/bert/test_modeling_bert.py       |  2 +-
 tests/models/bert/test_modeling_flax_bert.py  |  2 +-
 .../test_modeling_bert_generation.py          |  2 +-
 .../test_modeling_bigbird_pegasus.py          |  2 +-
 tests/models/biogpt/test_modeling_biogpt.py   |  2 +-
 .../blenderbot/test_modeling_blenderbot.py    |  2 +-
 .../test_modeling_blenderbot_small.py         |  2 +-
 tests/models/blip/test_modeling_blip.py       |  4 +--
 tests/models/blip/test_modeling_blip_text.py  |  2 +-
 tests/models/blip_2/test_modeling_blip_2.py   |  8 +++---
 tests/models/bloom/test_modeling_bloom.py     |  2 +-
 tests/models/canine/test_modeling_canine.py   |  2 +-
 .../test_modeling_chinese_clip.py             |  4 +--
 tests/models/clap/test_modeling_clap.py       |  2 +-
 tests/models/clip/test_modeling_clip.py       |  4 +--
 tests/models/clip/test_modeling_flax_clip.py  |  4 +--
 tests/models/clipseg/test_modeling_clipseg.py | 17 ++++++++++---
 tests/models/codegen/test_modeling_codegen.py |  2 +-
 .../models/convbert/test_modeling_convbert.py |  2 +-
 tests/models/cpmant/test_modeling_cpmant.py   |  2 +-
 tests/models/ctrl/test_modeling_ctrl.py       |  2 +-
 .../data2vec/test_modeling_data2vec_audio.py  |  2 +-
 .../data2vec/test_modeling_data2vec_text.py   |  2 +-
 .../data2vec/test_modeling_data2vec_vision.py |  2 +-
 tests/models/deberta/test_modeling_deberta.py |  2 +-
 .../deberta_v2/test_modeling_deberta_v2.py    |  2 +-
 tests/models/deit/test_modeling_deit.py       |  2 +-
 tests/models/dinov2/test_modeling_dinov2.py   |  2 +-
 .../distilbert/test_modeling_distilbert.py    |  2 +-
 .../test_modeling_flax_distilbert.py          |  2 +-
 tests/models/dpr/test_modeling_dpr.py         |  2 +-
 tests/models/dpt/test_modeling_dpt.py         |  4 +--
 tests/models/electra/test_modeling_electra.py |  2 +-
 .../electra/test_modeling_flax_electra.py     |  2 +-
 tests/models/ernie/test_modeling_ernie.py     |  2 +-
 tests/models/ernie_m/test_modeling_ernie_m.py |  2 +-
 tests/models/esm/test_modeling_esm.py         |  2 +-
 tests/models/esm/test_modeling_esmfold.py     |  2 +-
 tests/models/falcon/test_modeling_falcon.py   |  2 +-
 .../models/flaubert/test_modeling_flaubert.py |  2 +-
 tests/models/flava/test_modeling_flava.py     |  6 ++---
 tests/models/fnet/test_modeling_fnet.py       |  2 +-
 tests/models/git/test_modeling_git.py         |  4 +--
 tests/models/gpt2/test_modeling_flax_gpt2.py  |  2 +-
 tests/models/gpt2/test_modeling_gpt2.py       |  2 +-
 .../gpt_bigcode/test_modeling_gpt_bigcode.py  |  2 +-
 .../gpt_neo/test_modeling_flax_gpt_neo.py     |  4 +--
 tests/models/gpt_neo/test_modeling_gpt_neo.py |  4 +--
 .../models/gpt_neox/test_modeling_gpt_neox.py |  2 +-
 .../test_modeling_gpt_neox_japanese.py        |  2 +-
 tests/models/gptj/test_modeling_flax_gptj.py  |  2 +-
 tests/models/gptj/test_modeling_gptj.py       |  2 +-
 .../test_modeling_gptsan_japanese.py          |  2 +-
 .../models/groupvit/test_modeling_groupvit.py |  6 ++++-
 tests/models/hubert/test_modeling_hubert.py   |  2 +-
 tests/models/ibert/test_modeling_ibert.py     |  2 +-
 .../models/imagegpt/test_modeling_imagegpt.py |  2 +-
 .../test_modeling_instructblip.py             |  6 ++---
 .../models/layoutlm/test_modeling_layoutlm.py |  2 +-
 .../layoutlmv2/test_modeling_layoutlmv2.py    |  2 +-
 .../layoutlmv3/test_modeling_layoutlmv3.py    |  2 +-
 tests/models/llama/test_modeling_llama.py     |  2 +-
 .../longformer/test_modeling_longformer.py    |  2 +-
 .../longt5/test_modeling_flax_longt5.py       |  2 +-
 tests/models/longt5/test_modeling_longt5.py   |  4 +--
 tests/models/luke/test_modeling_luke.py       |  2 +-
 tests/models/marian/test_modeling_marian.py   |  2 +-
 .../models/markuplm/test_modeling_markuplm.py |  2 +-
 tests/models/mbart/test_modeling_mbart.py     |  2 +-
 tests/models/mega/test_modeling_mega.py       |  2 +-
 .../test_modeling_megatron_bert.py            |  2 +-
 tests/models/mgp_str/test_modeling_mgp_str.py |  2 +-
 .../mobilebert/test_modeling_mobilebert.py    |  2 +-
 tests/models/mpnet/test_modeling_mpnet.py     |  2 +-
 tests/models/mpt/test_modeling_mpt.py         |  2 +-
 tests/models/mra/test_modeling_mra.py         |  2 +-
 tests/models/mvp/test_modeling_mvp.py         |  2 +-
 tests/models/nezha/test_modeling_nezha.py     |  2 +-
 .../models/nllb_moe/test_modeling_nllb_moe.py |  2 +-
 .../test_modeling_nystromformer.py            |  2 +-
 tests/models/openai/test_modeling_openai.py   |  2 +-
 tests/models/opt/test_modeling_opt.py         |  2 +-
 tests/models/owlvit/test_modeling_owlvit.py   |  2 +-
 .../pegasus/test_modeling_flax_pegasus.py     |  2 +-
 tests/models/pegasus/test_modeling_pegasus.py |  2 +-
 .../pegasus_x/test_modeling_pegasus_x.py      |  2 +-
 .../pix2struct/test_modeling_pix2struct.py    |  4 +--
 tests/models/plbart/test_modeling_plbart.py   |  2 +-
 .../prophetnet/test_modeling_prophetnet.py    | 16 ++++++------
 tests/models/qdqbert/test_modeling_qdqbert.py |  2 +-
 tests/models/realm/test_modeling_realm.py     |  2 +-
 tests/models/rembert/test_modeling_rembert.py |  2 +-
 .../roberta/test_modeling_flax_roberta.py     |  2 +-
 tests/models/roberta/test_modeling_roberta.py |  2 +-
 ...test_modeling_flax_roberta_prelayernorm.py |  2 +-
 .../test_modeling_roberta_prelayernorm.py     |  2 +-
 .../models/roc_bert/test_modeling_roc_bert.py |  2 +-
 .../roformer/test_modeling_flax_roformer.py   |  2 +-
 .../models/roformer/test_modeling_roformer.py |  2 +-
 tests/models/rwkv/test_modeling_rwkv.py       |  2 +-
 tests/models/sew/test_modeling_sew.py         |  2 +-
 tests/models/sew_d/test_modeling_sew_d.py     |  2 +-
 .../test_modeling_speech_to_text_2.py         |  2 +-
 .../models/speecht5/test_modeling_speecht5.py |  8 +++---
 .../models/splinter/test_modeling_splinter.py |  2 +-
 .../squeezebert/test_modeling_squeezebert.py  |  2 +-
 .../test_modeling_switch_transformers.py      |  4 +--
 tests/models/t5/test_modeling_flax_t5.py      |  4 +--
 tests/models/t5/test_modeling_t5.py           |  4 +--
 tests/models/tapas/test_modeling_tapas.py     |  2 +-
 .../timesformer/test_modeling_timesformer.py  |  2 +-
 .../transfo_xl/test_modeling_transfo_xl.py    |  2 +-
 tests/models/trocr/test_modeling_trocr.py     |  2 +-
 tests/models/tvlt/test_modeling_tvlt.py       |  2 +-
 tests/models/umt5/test_modeling_umt5.py       |  2 +-
 .../unispeech/test_modeling_unispeech.py      |  2 +-
 .../test_modeling_unispeech_sat.py            |  2 +-
 .../models/videomae/test_modeling_videomae.py |  2 +-
 tests/models/vilt/test_modeling_vilt.py       |  2 +-
 .../visual_bert/test_modeling_visual_bert.py  |  2 +-
 tests/models/vit/test_modeling_flax_vit.py    |  2 +-
 tests/models/vit/test_modeling_vit.py         |  2 +-
 .../vit_hybrid/test_modeling_vit_hybrid.py    |  2 +-
 tests/models/vit_mae/test_modeling_vit_mae.py |  2 +-
 tests/models/vit_msn/test_modeling_vit_msn.py |  2 +-
 .../wav2vec2/test_modeling_flax_wav2vec2.py   |  2 +-
 .../models/wav2vec2/test_modeling_wav2vec2.py |  2 +-
 .../test_modeling_wav2vec2_conformer.py       |  2 +-
 tests/models/wavlm/test_modeling_wavlm.py     |  2 +-
 tests/models/x_clip/test_modeling_x_clip.py   |  4 +--
 tests/models/xglm/test_modeling_flax_xglm.py  |  2 +-
 tests/models/xglm/test_modeling_xglm.py       |  2 +-
 tests/models/xlm/test_modeling_xlm.py         |  2 +-
 .../test_modeling_xlm_roberta_xl.py           |  2 +-
 tests/models/xlnet/test_modeling_xlnet.py     |  2 +-
 tests/models/xmod/test_modeling_xmod.py       |  2 +-
 tests/models/yolos/test_modeling_yolos.py     |  2 +-
 tests/models/yoso/test_modeling_yoso.py       |  2 +-
 tests/test_modeling_common.py                 | 25 ++++++++-----------
 147 files changed, 207 insertions(+), 196 deletions(-)

diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py
index 96fa5596a2..75c84ad0d3 100644
--- a/tests/models/albert/test_modeling_albert.py
+++ b/tests/models/albert/test_modeling_albert.py
@@ -54,8 +54,9 @@ class AlbertModelTester:
         vocab_size=99,
         embedding_size=16,
         hidden_size=36,
-        num_hidden_layers=6,
-        num_hidden_groups=6,
+        num_hidden_layers=2,
+        # this needs to be the same as `num_hidden_layers`!
+        num_hidden_groups=2,
         num_attention_heads=6,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/albert/test_modeling_flax_albert.py b/tests/models/albert/test_modeling_flax_albert.py
index 5292665f55..0bdc8065bc 100644
--- a/tests/models/albert/test_modeling_flax_albert.py
+++ b/tests/models/albert/test_modeling_flax_albert.py
@@ -48,7 +48,7 @@ class FlaxAlbertModelTester(unittest.TestCase):
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py
index 35b2aebf6c..47918bcd83 100644
--- a/tests/models/align/test_modeling_align.py
+++ b/tests/models/align/test_modeling_align.py
@@ -242,7 +242,7 @@ class AlignTextModelTester:
         use_token_type_ids=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py
index fdae7768da..244ef1ed3b 100755
--- a/tests/models/altclip/test_modeling_altclip.py
+++ b/tests/models/altclip/test_modeling_altclip.py
@@ -60,7 +60,7 @@ class AltCLIPVisionModelTester:
         is_training=True,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -212,7 +212,7 @@ class AltCLIPTextModelTester:
         hidden_size=32,
         projection_dim=32,
         project_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py
index c36e946f19..ce596d84e3 100644
--- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py
+++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py
@@ -55,7 +55,7 @@ class ASTModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py
index 949e647e6f..01189e5628 100644
--- a/tests/models/bart/test_modeling_bart.py
+++ b/tests/models/bart/test_modeling_bart.py
@@ -1289,7 +1289,7 @@ class BartStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         encoder_attention_heads=4,
         decoder_attention_heads=4,
         max_position_embeddings=30,
diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py
index 4c0e5e5a45..2a35cddf40 100644
--- a/tests/models/beit/test_modeling_beit.py
+++ b/tests/models/beit/test_modeling_beit.py
@@ -64,7 +64,7 @@ class BeitModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/beit/test_modeling_flax_beit.py b/tests/models/beit/test_modeling_flax_beit.py
index 0587174bac..78c24220c2 100644
--- a/tests/models/beit/test_modeling_flax_beit.py
+++ b/tests/models/beit/test_modeling_flax_beit.py
@@ -48,7 +48,7 @@ class FlaxBeitModelTester(unittest.TestCase):
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py
index 52c8035d8a..9aec91367d 100644
--- a/tests/models/bert/test_modeling_bert.py
+++ b/tests/models/bert/test_modeling_bert.py
@@ -57,7 +57,7 @@ class BertModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/bert/test_modeling_flax_bert.py b/tests/models/bert/test_modeling_flax_bert.py
index 55ffb44019..8226899175 100644
--- a/tests/models/bert/test_modeling_flax_bert.py
+++ b/tests/models/bert/test_modeling_flax_bert.py
@@ -47,7 +47,7 @@ class FlaxBertModelTester(unittest.TestCase):
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/bert_generation/test_modeling_bert_generation.py b/tests/models/bert_generation/test_modeling_bert_generation.py
index ced98e6f72..ecd7a459e0 100644
--- a/tests/models/bert_generation/test_modeling_bert_generation.py
+++ b/tests/models/bert_generation/test_modeling_bert_generation.py
@@ -41,7 +41,7 @@ class BertGenerationEncoderTester:
         use_input_mask=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py
index 5d345db3fc..aedbbb4634 100644
--- a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py
+++ b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py
@@ -605,7 +605,7 @@ class BigBirdPegasusStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         encoder_attention_heads=4,
         decoder_attention_heads=4,
         max_position_embeddings=30,
diff --git a/tests/models/biogpt/test_modeling_biogpt.py b/tests/models/biogpt/test_modeling_biogpt.py
index e504109308..e43fc1e41b 100644
--- a/tests/models/biogpt/test_modeling_biogpt.py
+++ b/tests/models/biogpt/test_modeling_biogpt.py
@@ -51,7 +51,7 @@ class BioGptModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/blenderbot/test_modeling_blenderbot.py b/tests/models/blenderbot/test_modeling_blenderbot.py
index 499c7aa521..ca1630b3cf 100644
--- a/tests/models/blenderbot/test_modeling_blenderbot.py
+++ b/tests/models/blenderbot/test_modeling_blenderbot.py
@@ -356,7 +356,7 @@ class BlenderbotStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         encoder_attention_heads=4,
         decoder_attention_heads=4,
         max_position_embeddings=30,
diff --git a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py
index 257aa1699c..249a8a799a 100644
--- a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py
+++ b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py
@@ -365,7 +365,7 @@ class BlenderbotSmallStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         encoder_attention_heads=4,
         decoder_attention_heads=4,
         max_position_embeddings=30,
diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py
index 9a6e3da06c..cf8c487082 100644
--- a/tests/models/blip/test_modeling_blip.py
+++ b/tests/models/blip/test_modeling_blip.py
@@ -70,7 +70,7 @@ class BlipVisionModelTester:
         is_training=True,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -221,7 +221,7 @@ class BlipTextModelTester:
         vocab_size=99,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
diff --git a/tests/models/blip/test_modeling_blip_text.py b/tests/models/blip/test_modeling_blip_text.py
index 488512f6e2..2301b776fe 100644
--- a/tests/models/blip/test_modeling_blip_text.py
+++ b/tests/models/blip/test_modeling_blip_text.py
@@ -44,7 +44,7 @@ class BlipTextModelTester:
         vocab_size=99,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py
index 71f652050f..c5bdb70791 100644
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -62,7 +62,7 @@ class Blip2VisionModelTester:
         is_training=True,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -215,7 +215,7 @@ class Blip2QFormerModelTester:
         vocab_size=99,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=6,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -289,7 +289,7 @@ class Blip2TextModelDecoderOnlyTester:
         use_labels=False,
         vocab_size=99,
         hidden_size=16,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=4,
         hidden_act="gelu",
@@ -503,7 +503,7 @@ class Blip2TextModelTester:
         use_attention_mask=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py
index 4e9b837c8a..de7cb03e7e 100644
--- a/tests/models/bloom/test_modeling_bloom.py
+++ b/tests/models/bloom/test_modeling_bloom.py
@@ -54,7 +54,7 @@ class BloomModelTester:
         use_mc_token_ids=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py
index 057fc09131..303d465ca9 100644
--- a/tests/models/canine/test_modeling_canine.py
+++ b/tests/models/canine/test_modeling_canine.py
@@ -53,7 +53,7 @@ class CanineModelTester:
         # NOTE: this is not a model parameter, just an input
         vocab_size=100000,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py
index 894f1e1279..137c3c2888 100644
--- a/tests/models/chinese_clip/test_modeling_chinese_clip.py
+++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py
@@ -69,7 +69,7 @@ class ChineseCLIPTextModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
@@ -246,7 +246,7 @@ class ChineseCLIPVisionModelTester:
         is_training=True,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py
index 35c0f4e203..dc5718850f 100644
--- a/tests/models/clap/test_modeling_clap.py
+++ b/tests/models/clap/test_modeling_clap.py
@@ -287,7 +287,7 @@ class ClapTextModelTester:
         vocab_size=99,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py
index 4239ce5ed0..996bea95b9 100644
--- a/tests/models/clip/test_modeling_clip.py
+++ b/tests/models/clip/test_modeling_clip.py
@@ -86,7 +86,7 @@ class CLIPVisionModelTester:
         is_training=True,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -261,7 +261,7 @@ class CLIPTextModelTester:
         vocab_size=99,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
diff --git a/tests/models/clip/test_modeling_flax_clip.py b/tests/models/clip/test_modeling_flax_clip.py
index 7d63fa9edf..565c641aef 100644
--- a/tests/models/clip/test_modeling_flax_clip.py
+++ b/tests/models/clip/test_modeling_flax_clip.py
@@ -35,7 +35,7 @@ class FlaxCLIPVisionModelTester:
         num_channels=3,
         is_training=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -252,7 +252,7 @@ class FlaxCLIPTextModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py
index e931bdc8d5..37a71d1b18 100644
--- a/tests/models/clipseg/test_modeling_clipseg.py
+++ b/tests/models/clipseg/test_modeling_clipseg.py
@@ -78,7 +78,7 @@ class CLIPSegVisionModelTester:
         num_channels=3,
         is_training=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -228,7 +228,7 @@ class CLIPSegTextModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -346,7 +346,15 @@ class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase):
 
 
 class CLIPSegModelTester:
-    def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
+    def __init__(
+        self,
+        parent,
+        text_kwargs=None,
+        vision_kwargs=None,
+        is_training=True,
+        # This should respect the `num_hidden_layers` in `CLIPSegVisionModelTester`
+        extract_layers=(1,),
+    ):
         if text_kwargs is None:
             text_kwargs = {}
         if vision_kwargs is None:
@@ -356,6 +364,7 @@ class CLIPSegModelTester:
         self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs)
         self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs)
         self.is_training = is_training
+        self.extract_layers = extract_layers
 
     def prepare_config_and_inputs(self):
         text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
@@ -371,7 +380,7 @@ class CLIPSegModelTester:
             self.vision_model_tester.get_config(),
             projection_dim=64,
             reduce_dim=32,
-            extract_layers=[1, 2, 3],
+            extract_layers=self.extract_layers,
         )
 
     def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
diff --git a/tests/models/codegen/test_modeling_codegen.py b/tests/models/codegen/test_modeling_codegen.py
index 9072c2b5bc..34a32caa7f 100644
--- a/tests/models/codegen/test_modeling_codegen.py
+++ b/tests/models/codegen/test_modeling_codegen.py
@@ -47,7 +47,7 @@ class CodeGenModelTester:
         vocab_size=256,
         hidden_size=32,
         rotary_dim=4,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/convbert/test_modeling_convbert.py b/tests/models/convbert/test_modeling_convbert.py
index dc1550acc2..754967ce00 100644
--- a/tests/models/convbert/test_modeling_convbert.py
+++ b/tests/models/convbert/test_modeling_convbert.py
@@ -53,7 +53,7 @@ class ConvBertModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/cpmant/test_modeling_cpmant.py b/tests/models/cpmant/test_modeling_cpmant.py
index f3ebeaad3e..6ecfe15c2e 100644
--- a/tests/models/cpmant/test_modeling_cpmant.py
+++ b/tests/models/cpmant/test_modeling_cpmant.py
@@ -49,7 +49,7 @@ class CpmAntModelTester:
         use_mc_token_ids=False,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=3,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         num_buckets=32,
diff --git a/tests/models/ctrl/test_modeling_ctrl.py b/tests/models/ctrl/test_modeling_ctrl.py
index dfcb2c9133..ff42744415 100644
--- a/tests/models/ctrl/test_modeling_ctrl.py
+++ b/tests/models/ctrl/test_modeling_ctrl.py
@@ -49,7 +49,7 @@ class CTRLModelTester:
         use_mc_token_ids=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py
index 67fe0cbe70..e9448621e9 100644
--- a/tests/models/data2vec/test_modeling_data2vec_audio.py
+++ b/tests/models/data2vec/test_modeling_data2vec_audio.py
@@ -59,7 +59,7 @@ class Data2VecAudioModelTester:
         conv_bias=False,
         num_conv_pos_embeddings=16,
         num_conv_pos_embedding_groups=2,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         hidden_dropout_prob=0.1,
         intermediate_size=20,
diff --git a/tests/models/data2vec/test_modeling_data2vec_text.py b/tests/models/data2vec/test_modeling_data2vec_text.py
index a45c9b6a8b..4b4b2835dc 100644
--- a/tests/models/data2vec/test_modeling_data2vec_text.py
+++ b/tests/models/data2vec/test_modeling_data2vec_text.py
@@ -57,7 +57,7 @@ class Data2VecTextModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/data2vec/test_modeling_data2vec_vision.py b/tests/models/data2vec/test_modeling_data2vec_vision.py
index 299ffad3e7..69a763a4f2 100644
--- a/tests/models/data2vec/test_modeling_data2vec_vision.py
+++ b/tests/models/data2vec/test_modeling_data2vec_vision.py
@@ -59,7 +59,7 @@ class Data2VecVisionModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/deberta/test_modeling_deberta.py b/tests/models/deberta/test_modeling_deberta.py
index 7daff3b522..52758e2222 100644
--- a/tests/models/deberta/test_modeling_deberta.py
+++ b/tests/models/deberta/test_modeling_deberta.py
@@ -47,7 +47,7 @@ class DebertaModelTester(object):
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/deberta_v2/test_modeling_deberta_v2.py b/tests/models/deberta_v2/test_modeling_deberta_v2.py
index 548c9617b8..abfbe7402c 100644
--- a/tests/models/deberta_v2/test_modeling_deberta_v2.py
+++ b/tests/models/deberta_v2/test_modeling_deberta_v2.py
@@ -48,7 +48,7 @@ class DebertaV2ModelTester(object):
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/deit/test_modeling_deit.py b/tests/models/deit/test_modeling_deit.py
index 37bfe3fa70..2685900afb 100644
--- a/tests/models/deit/test_modeling_deit.py
+++ b/tests/models/deit/test_modeling_deit.py
@@ -69,7 +69,7 @@ class DeiTModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/dinov2/test_modeling_dinov2.py b/tests/models/dinov2/test_modeling_dinov2.py
index ed69faa444..cf7ff95b57 100644
--- a/tests/models/dinov2/test_modeling_dinov2.py
+++ b/tests/models/dinov2/test_modeling_dinov2.py
@@ -57,7 +57,7 @@ class Dinov2ModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py
index 9d17a1c441..ff56afd0a9 100644
--- a/tests/models/distilbert/test_modeling_distilbert.py
+++ b/tests/models/distilbert/test_modeling_distilbert.py
@@ -50,7 +50,7 @@ class DistilBertModelTester(object):
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/distilbert/test_modeling_flax_distilbert.py b/tests/models/distilbert/test_modeling_flax_distilbert.py
index f4481a6e4a..1f5a402e86 100644
--- a/tests/models/distilbert/test_modeling_flax_distilbert.py
+++ b/tests/models/distilbert/test_modeling_flax_distilbert.py
@@ -47,7 +47,7 @@ class FlaxDistilBertModelTester(unittest.TestCase):
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/dpr/test_modeling_dpr.py b/tests/models/dpr/test_modeling_dpr.py
index cd4f430ded..b6a687a351 100644
--- a/tests/models/dpr/test_modeling_dpr.py
+++ b/tests/models/dpr/test_modeling_dpr.py
@@ -48,7 +48,7 @@ class DPRModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py
index 62ac20df31..247791ed41 100644
--- a/tests/models/dpt/test_modeling_dpt.py
+++ b/tests/models/dpt/test_modeling_dpt.py
@@ -53,7 +53,7 @@ class DPTModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         backbone_out_indices=[0, 1, 2, 3],
         num_attention_heads=4,
         intermediate_size=37,
@@ -62,7 +62,7 @@ class DPTModelTester:
         attention_probs_dropout_prob=0.1,
         initializer_range=0.02,
         num_labels=3,
-        neck_hidden_sizes=[16, 16, 32, 32],
+        neck_hidden_sizes=[16, 32],
         is_hybrid=False,
         scope=None,
     ):
diff --git a/tests/models/electra/test_modeling_electra.py b/tests/models/electra/test_modeling_electra.py
index 550bc14487..a5d3fa585e 100644
--- a/tests/models/electra/test_modeling_electra.py
+++ b/tests/models/electra/test_modeling_electra.py
@@ -54,7 +54,7 @@ class ElectraModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/electra/test_modeling_flax_electra.py b/tests/models/electra/test_modeling_flax_electra.py
index 0dda4e38fd..19b35d8940 100644
--- a/tests/models/electra/test_modeling_flax_electra.py
+++ b/tests/models/electra/test_modeling_flax_electra.py
@@ -34,7 +34,7 @@ class FlaxElectraModelTester(unittest.TestCase):
         vocab_size=99,
         embedding_size=24,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py
index e845bd1f83..f0bdec3efb 100644
--- a/tests/models/ernie/test_modeling_ernie.py
+++ b/tests/models/ernie/test_modeling_ernie.py
@@ -56,7 +56,7 @@ class ErnieModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/ernie_m/test_modeling_ernie_m.py b/tests/models/ernie_m/test_modeling_ernie_m.py
index 5e0ac95233..1fafcd34ba 100644
--- a/tests/models/ernie_m/test_modeling_ernie_m.py
+++ b/tests/models/ernie_m/test_modeling_ernie_m.py
@@ -50,7 +50,7 @@ class ErnieMModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/esm/test_modeling_esm.py b/tests/models/esm/test_modeling_esm.py
index 2e5d48082b..f242e77966 100644
--- a/tests/models/esm/test_modeling_esm.py
+++ b/tests/models/esm/test_modeling_esm.py
@@ -49,7 +49,7 @@ class EsmModelTester:
         use_labels=True,
         vocab_size=33,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py
index 39f274af54..1ec5ab8acb 100644
--- a/tests/models/esm/test_modeling_esmfold.py
+++ b/tests/models/esm/test_modeling_esmfold.py
@@ -43,7 +43,7 @@ class EsmFoldModelTester:
         use_labels=False,
         vocab_size=19,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py
index 6530eeb1a1..0efc762371 100644
--- a/tests/models/falcon/test_modeling_falcon.py
+++ b/tests/models/falcon/test_modeling_falcon.py
@@ -50,7 +50,7 @@ class FalconModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/flaubert/test_modeling_flaubert.py b/tests/models/flaubert/test_modeling_flaubert.py
index 99dbf927e1..61806182bb 100644
--- a/tests/models/flaubert/test_modeling_flaubert.py
+++ b/tests/models/flaubert/test_modeling_flaubert.py
@@ -57,7 +57,7 @@ class FlaubertModelTester(object):
         vocab_size=99,
         n_special=0,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,
diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py
index f1221f1061..0224181637 100644
--- a/tests/models/flava/test_modeling_flava.py
+++ b/tests/models/flava/test_modeling_flava.py
@@ -79,7 +79,7 @@ class FlavaImageModelTester:
         parent,
         batch_size=12,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
@@ -342,7 +342,7 @@ class FlavaTextModelTester:
         max_position_embeddings=512,
         position_embedding_type="absolute",
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
@@ -487,7 +487,7 @@ class FlavaMultimodalModelTester:
         seq_length=44,
         use_input_mask=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py
index e7e592d5b6..01e9942de2 100644
--- a/tests/models/fnet/test_modeling_fnet.py
+++ b/tests/models/fnet/test_modeling_fnet.py
@@ -70,7 +70,7 @@ class FNetModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         intermediate_size=37,
         hidden_act="gelu",
         hidden_dropout_prob=0.1,
diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py
index ed094db4a0..0dde54a398 100644
--- a/tests/models/git/test_modeling_git.py
+++ b/tests/models/git/test_modeling_git.py
@@ -51,7 +51,7 @@ class GitVisionModelTester:
         is_training=True,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -203,7 +203,7 @@ class GitModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/gpt2/test_modeling_flax_gpt2.py b/tests/models/gpt2/test_modeling_flax_gpt2.py
index e842bbc732..9bdc17fa19 100644
--- a/tests/models/gpt2/test_modeling_flax_gpt2.py
+++ b/tests/models/gpt2/test_modeling_flax_gpt2.py
@@ -52,7 +52,7 @@ class FlaxGPT2ModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py
index f820b54942..c941039888 100644
--- a/tests/models/gpt2/test_modeling_gpt2.py
+++ b/tests/models/gpt2/test_modeling_gpt2.py
@@ -56,7 +56,7 @@ class GPT2ModelTester:
         use_mc_token_ids=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py
index 8beddc0aba..3d4dd27fa4 100644
--- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py
+++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py
@@ -55,7 +55,7 @@ class GPTBigCodeModelTester:
         use_mc_token_ids=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="relu",
diff --git a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
index a32f35f6e7..58574a8b1d 100644
--- a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
+++ b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
@@ -52,9 +52,9 @@ class FlaxGPTNeoModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=4,
-        attention_types=[[["global", "local"], 2]],
+        attention_types=[[["global", "local"], 1]],
         intermediate_size=37,
         hidden_act="gelu",
         hidden_dropout_prob=0.1,
diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.py b/tests/models/gpt_neo/test_modeling_gpt_neo.py
index a79cf5b25d..075b9a2663 100644
--- a/tests/models/gpt_neo/test_modeling_gpt_neo.py
+++ b/tests/models/gpt_neo/test_modeling_gpt_neo.py
@@ -54,8 +54,8 @@ class GPTNeoModelTester:
         use_mc_token_ids=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=4,
-        attention_types=[[["global", "local"], 2]],
+        num_hidden_layers=2,
+        attention_types=[[["global", "local"], 1]],
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py
index 176970779e..8777bd3abd 100644
--- a/tests/models/gpt_neox/test_modeling_gpt_neox.py
+++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py
@@ -52,7 +52,7 @@ class GPTNeoXModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=64,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py
index 47bb22b627..fc78b8bdd4 100644
--- a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py
+++ b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py
@@ -44,7 +44,7 @@ class GPTNeoXJapaneseModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_multiple_size=4,
         hidden_act="gelu",
diff --git a/tests/models/gptj/test_modeling_flax_gptj.py b/tests/models/gptj/test_modeling_flax_gptj.py
index d177e345e8..48061f84d8 100644
--- a/tests/models/gptj/test_modeling_flax_gptj.py
+++ b/tests/models/gptj/test_modeling_flax_gptj.py
@@ -53,7 +53,7 @@ class FlaxGPTJModelTester:
         vocab_size=99,
         hidden_size=32,
         rotary_dim=4,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py
index 3636d357d5..f0e0270070 100644
--- a/tests/models/gptj/test_modeling_gptj.py
+++ b/tests/models/gptj/test_modeling_gptj.py
@@ -56,7 +56,7 @@ class GPTJModelTester:
         vocab_size=99,
         hidden_size=32,
         rotary_dim=4,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py b/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
index 54a98cf70f..1a86e23fdc 100644
--- a/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
+++ b/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
@@ -45,7 +45,7 @@ class GPTSanJapaneseTester:
         is_training=True,
         hidden_size=32,
         ext_size=42,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_ext_layers=2,
         num_attention_heads=4,
         num_experts=2,
diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py
index 261841277a..6d52b6b501 100644
--- a/tests/models/groupvit/test_modeling_groupvit.py
+++ b/tests/models/groupvit/test_modeling_groupvit.py
@@ -356,7 +356,7 @@ class GroupViTTextModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -553,6 +553,10 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
     def test_model_common_attributes(self):
         pass
 
+    # overwritten from parent as this equivalent test needs a specific `seed` and hard to get a good one!
+    def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=2e-5, name="outputs", attributes=None):
+        super().check_pt_tf_outputs(tf_outputs, pt_outputs, model_class, tol=tol, name=name, attributes=attributes)
+
     @is_pt_tf_cross_test
     def test_pt_tf_model_equivalence(self):
         import tensorflow as tf
diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py
index bad1a561da..c5a6a1398f 100644
--- a/tests/models/hubert/test_modeling_hubert.py
+++ b/tests/models/hubert/test_modeling_hubert.py
@@ -71,7 +71,7 @@ class HubertModelTester:
         conv_bias=False,
         num_conv_pos_embeddings=16,
         num_conv_pos_embedding_groups=2,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         hidden_dropout_prob=0.1,  # this is most likely not correctly set yet
         intermediate_size=20,
diff --git a/tests/models/ibert/test_modeling_ibert.py b/tests/models/ibert/test_modeling_ibert.py
index 9f2f2c9502..096a55169a 100644
--- a/tests/models/ibert/test_modeling_ibert.py
+++ b/tests/models/ibert/test_modeling_ibert.py
@@ -62,7 +62,7 @@ class IBertModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py
index 19fe688bf4..b4e2cd5ab4 100644
--- a/tests/models/imagegpt/test_modeling_imagegpt.py
+++ b/tests/models/imagegpt/test_modeling_imagegpt.py
@@ -65,7 +65,7 @@ class ImageGPTModelTester:
         use_mc_token_ids=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py
index d659c38916..49d780918c 100644
--- a/tests/models/instructblip/test_modeling_instructblip.py
+++ b/tests/models/instructblip/test_modeling_instructblip.py
@@ -64,7 +64,7 @@ class InstructBlipVisionModelTester:
         is_training=True,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -219,7 +219,7 @@ class InstructBlipQFormerModelTester:
         vocab_size=99,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=6,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -295,7 +295,7 @@ class InstructBlipTextModelDecoderOnlyTester:
         use_labels=False,
         vocab_size=99,
         hidden_size=16,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=4,
         hidden_act="gelu",
diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py
index 0535fbf4e1..aafa53969d 100644
--- a/tests/models/layoutlm/test_modeling_layoutlm.py
+++ b/tests/models/layoutlm/test_modeling_layoutlm.py
@@ -48,7 +48,7 @@ class LayoutLMModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
index c8457331c5..cffa09d6d0 100644
--- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -55,7 +55,7 @@ class LayoutLMv2ModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=36,
-        num_hidden_layers=3,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py
index 2c3aef9b93..bf9a0b8314 100644
--- a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py
@@ -63,7 +63,7 @@ class LayoutLMv3ModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=36,
-        num_hidden_layers=3,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py
index c2efc3f5a3..e8d5f9abe6 100644
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -46,7 +46,7 @@ class LlamaModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/longformer/test_modeling_longformer.py b/tests/models/longformer/test_modeling_longformer.py
index 21853e4420..b40e464e60 100644
--- a/tests/models/longformer/test_modeling_longformer.py
+++ b/tests/models/longformer/test_modeling_longformer.py
@@ -50,7 +50,7 @@ class LongformerModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/longt5/test_modeling_flax_longt5.py b/tests/models/longt5/test_modeling_flax_longt5.py
index 2c262bef30..9449cfa5e3 100644
--- a/tests/models/longt5/test_modeling_flax_longt5.py
+++ b/tests/models/longt5/test_modeling_flax_longt5.py
@@ -71,7 +71,7 @@ class FlaxLongT5ModelTester:
         use_attention_mask=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
diff --git a/tests/models/longt5/test_modeling_longt5.py b/tests/models/longt5/test_modeling_longt5.py
index 0f7ae0a272..b2d17dc0e6 100644
--- a/tests/models/longt5/test_modeling_longt5.py
+++ b/tests/models/longt5/test_modeling_longt5.py
@@ -59,7 +59,7 @@ class LongT5ModelTester:
         use_attention_mask=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
@@ -916,7 +916,7 @@ class LongT5EncoderOnlyModelTester:
         # For common tests
         use_attention_mask=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
diff --git a/tests/models/luke/test_modeling_luke.py b/tests/models/luke/test_modeling_luke.py
index 35bdb6b6d5..95cb4f5d01 100644
--- a/tests/models/luke/test_modeling_luke.py
+++ b/tests/models/luke/test_modeling_luke.py
@@ -61,7 +61,7 @@ class LukeModelTester:
         entity_vocab_size=10,
         entity_emb_size=6,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py
index 6cbcd55d3f..8fd5e04a56 100644
--- a/tests/models/marian/test_modeling_marian.py
+++ b/tests/models/marian/test_modeling_marian.py
@@ -661,7 +661,7 @@ class MarianStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         encoder_attention_heads=4,
         decoder_attention_heads=4,
         max_position_embeddings=30,
diff --git a/tests/models/markuplm/test_modeling_markuplm.py b/tests/models/markuplm/test_modeling_markuplm.py
index 09d2f1ad52..71757385e8 100644
--- a/tests/models/markuplm/test_modeling_markuplm.py
+++ b/tests/models/markuplm/test_modeling_markuplm.py
@@ -53,7 +53,7 @@ class MarkupLMModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/mbart/test_modeling_mbart.py b/tests/models/mbart/test_modeling_mbart.py
index ec3d36f33d..db5b554e82 100644
--- a/tests/models/mbart/test_modeling_mbart.py
+++ b/tests/models/mbart/test_modeling_mbart.py
@@ -491,7 +491,7 @@ class MBartStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         encoder_attention_heads=4,
         decoder_attention_heads=4,
         max_position_embeddings=30,
diff --git a/tests/models/mega/test_modeling_mega.py b/tests/models/mega/test_modeling_mega.py
index dfb00d190f..e10ecc5487 100644
--- a/tests/models/mega/test_modeling_mega.py
+++ b/tests/models/mega/test_modeling_mega.py
@@ -51,7 +51,7 @@ class MegaModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         intermediate_size=37,
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,
diff --git a/tests/models/megatron_bert/test_modeling_megatron_bert.py b/tests/models/megatron_bert/test_modeling_megatron_bert.py
index bc1d81c4e0..818f65d80c 100644
--- a/tests/models/megatron_bert/test_modeling_megatron_bert.py
+++ b/tests/models/megatron_bert/test_modeling_megatron_bert.py
@@ -58,7 +58,7 @@ class MegatronBertModelTester:
         vocab_size=99,
         hidden_size=64,
         embedding_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/mgp_str/test_modeling_mgp_str.py b/tests/models/mgp_str/test_modeling_mgp_str.py
index 1d972e22a3..d8ba50a350 100644
--- a/tests/models/mgp_str/test_modeling_mgp_str.py
+++ b/tests/models/mgp_str/test_modeling_mgp_str.py
@@ -55,7 +55,7 @@ class MgpstrModelTester:
         num_bpe_labels=99,
         num_wordpiece_labels=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         mlp_ratio=4.0,
         patch_embeds_hidden_size=257,
diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py
index 6e4f696b8b..572490071a 100644
--- a/tests/models/mobilebert/test_modeling_mobilebert.py
+++ b/tests/models/mobilebert/test_modeling_mobilebert.py
@@ -54,7 +54,7 @@ class MobileBertModelTester:
         vocab_size=99,
         hidden_size=64,
         embedding_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/mpnet/test_modeling_mpnet.py b/tests/models/mpnet/test_modeling_mpnet.py
index d3261e4bc0..fc16764174 100644
--- a/tests/models/mpnet/test_modeling_mpnet.py
+++ b/tests/models/mpnet/test_modeling_mpnet.py
@@ -49,7 +49,7 @@ class MPNetModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=64,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=64,
         hidden_act="gelu",
diff --git a/tests/models/mpt/test_modeling_mpt.py b/tests/models/mpt/test_modeling_mpt.py
index 91cb35bb7a..363c493b1e 100644
--- a/tests/models/mpt/test_modeling_mpt.py
+++ b/tests/models/mpt/test_modeling_mpt.py
@@ -54,7 +54,7 @@ class MptModelTester:
         use_mc_token_ids=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/mra/test_modeling_mra.py b/tests/models/mra/test_modeling_mra.py
index c6a0813032..aac9ce5bc1 100644
--- a/tests/models/mra/test_modeling_mra.py
+++ b/tests/models/mra/test_modeling_mra.py
@@ -51,7 +51,7 @@ class MraModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=16,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=2,
         intermediate_size=36,
         hidden_act="gelu",
diff --git a/tests/models/mvp/test_modeling_mvp.py b/tests/models/mvp/test_modeling_mvp.py
index cc3986a370..8e6143529a 100644
--- a/tests/models/mvp/test_modeling_mvp.py
+++ b/tests/models/mvp/test_modeling_mvp.py
@@ -595,7 +595,7 @@ class MvpStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         encoder_attention_heads=4,
         decoder_attention_heads=4,
         max_position_embeddings=30,
diff --git a/tests/models/nezha/test_modeling_nezha.py b/tests/models/nezha/test_modeling_nezha.py
index 5b36ffbc96..a71823d8a5 100644
--- a/tests/models/nezha/test_modeling_nezha.py
+++ b/tests/models/nezha/test_modeling_nezha.py
@@ -55,7 +55,7 @@ class NezhaModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/nllb_moe/test_modeling_nllb_moe.py b/tests/models/nllb_moe/test_modeling_nllb_moe.py
index 030b5f2a89..9311a01990 100644
--- a/tests/models/nllb_moe/test_modeling_nllb_moe.py
+++ b/tests/models/nllb_moe/test_modeling_nllb_moe.py
@@ -52,7 +52,7 @@ class NllbMoeModelTester:
         use_labels=False,
         vocab_size=99,
         hidden_size=16,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=4,
         hidden_act="relu",
diff --git a/tests/models/nystromformer/test_modeling_nystromformer.py b/tests/models/nystromformer/test_modeling_nystromformer.py
index 390308631d..ae06670103 100644
--- a/tests/models/nystromformer/test_modeling_nystromformer.py
+++ b/tests/models/nystromformer/test_modeling_nystromformer.py
@@ -51,7 +51,7 @@ class NystromformerModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/openai/test_modeling_openai.py b/tests/models/openai/test_modeling_openai.py
index 0e8ba6d9ce..98d74ee5f8 100644
--- a/tests/models/openai/test_modeling_openai.py
+++ b/tests/models/openai/test_modeling_openai.py
@@ -49,7 +49,7 @@ class OpenAIGPTModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/opt/test_modeling_opt.py b/tests/models/opt/test_modeling_opt.py
index 251282a91b..69a063f276 100644
--- a/tests/models/opt/test_modeling_opt.py
+++ b/tests/models/opt/test_modeling_opt.py
@@ -70,7 +70,7 @@ class OPTModelTester:
         use_labels=False,
         vocab_size=99,
         hidden_size=16,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=4,
         hidden_act="gelu",
diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py
index 4dbd1fb0a8..8360b9f2a2 100644
--- a/tests/models/owlvit/test_modeling_owlvit.py
+++ b/tests/models/owlvit/test_modeling_owlvit.py
@@ -62,7 +62,7 @@ class OwlViTVisionModelTester:
         num_channels=3,
         is_training=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
diff --git a/tests/models/pegasus/test_modeling_flax_pegasus.py b/tests/models/pegasus/test_modeling_flax_pegasus.py
index fbc49c7811..62b9077f0d 100644
--- a/tests/models/pegasus/test_modeling_flax_pegasus.py
+++ b/tests/models/pegasus/test_modeling_flax_pegasus.py
@@ -52,7 +52,7 @@ class FlaxPegasusModelTester:
         use_labels=False,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_dropout_prob=0.1,
diff --git a/tests/models/pegasus/test_modeling_pegasus.py b/tests/models/pegasus/test_modeling_pegasus.py
index bde7477f94..4011fe2c68 100644
--- a/tests/models/pegasus/test_modeling_pegasus.py
+++ b/tests/models/pegasus/test_modeling_pegasus.py
@@ -371,7 +371,7 @@ class PegasusStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         encoder_attention_heads=4,
         decoder_attention_heads=4,
         max_position_embeddings=30,
diff --git a/tests/models/pegasus_x/test_modeling_pegasus_x.py b/tests/models/pegasus_x/test_modeling_pegasus_x.py
index 73c4ee62bf..22d7b0c863 100644
--- a/tests/models/pegasus_x/test_modeling_pegasus_x.py
+++ b/tests/models/pegasus_x/test_modeling_pegasus_x.py
@@ -670,7 +670,7 @@ class PegasusXStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         encoder_attention_heads=4,
         decoder_attention_heads=4,
         max_position_embeddings=30,
diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py
index c49db3dcff..34ca767d6b 100644
--- a/tests/models/pix2struct/test_modeling_pix2struct.py
+++ b/tests/models/pix2struct/test_modeling_pix2struct.py
@@ -71,7 +71,7 @@ class Pix2StructVisionModelTester:
         patch_embed_hidden_size=12,
         projection_dim=32,
         max_patches=64,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -230,7 +230,7 @@ class Pix2StructTextModelTester:
         vocab_size=99,
         hidden_size=12,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
diff --git a/tests/models/plbart/test_modeling_plbart.py b/tests/models/plbart/test_modeling_plbart.py
index 05dbac6a2c..4cd8ecd14f 100644
--- a/tests/models/plbart/test_modeling_plbart.py
+++ b/tests/models/plbart/test_modeling_plbart.py
@@ -473,7 +473,7 @@ class PLBartStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         encoder_attention_heads=4,
         decoder_attention_heads=4,
         max_position_embeddings=30,
diff --git a/tests/models/prophetnet/test_modeling_prophetnet.py b/tests/models/prophetnet/test_modeling_prophetnet.py
index fa717b2743..eee03134d3 100644
--- a/tests/models/prophetnet/test_modeling_prophetnet.py
+++ b/tests/models/prophetnet/test_modeling_prophetnet.py
@@ -55,10 +55,10 @@ class ProphetNetModelTester:
         use_labels=True,
         decoder_start_token_id=0,
         encoder_ffn_dim=32,
-        num_encoder_layers=4,
+        num_encoder_layers=2,
         num_encoder_attention_heads=4,
         decoder_ffn_dim=32,
-        num_decoder_layers=4,
+        num_decoder_layers=2,
         num_decoder_attention_heads=4,
         max_position_embeddings=30,
         is_encoder_decoder=True,
@@ -437,10 +437,10 @@ class ProphetNetModelTester:
                 decoder_attention_mask=decoder_attention_mask,
                 labels=lm_labels,
             )
-        self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(4.5981, device=torch_device), atol=1e-3))
+        self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(4.5892, device=torch_device), atol=1e-3))
 
         expected_logit_slice = torch.tensor(
-            [-0.0648, 0.0790, 0.0360, 0.0089, 0.0039, -0.0639, 0.0131], device=torch_device
+            [-0.0184, 0.0758, -0.0543, -0.0093, 0.0050, -0.0660, -0.1453], device=torch_device
         )
         self.parent.assertTrue(torch.allclose(result.logits[0, :, 1], expected_logit_slice, atol=1e-3))
 
@@ -551,10 +551,10 @@ class ProphetNetStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=0,
         encoder_ffn_dim=32,
-        num_encoder_layers=4,
+        num_encoder_layers=2,
         num_encoder_attention_heads=4,
         decoder_ffn_dim=32,
-        num_decoder_layers=4,
+        num_decoder_layers=2,
         num_decoder_attention_heads=4,
         max_position_embeddings=30,
         is_encoder_decoder=False,
@@ -782,10 +782,10 @@ class ProphetNetStandaloneEncoderModelTester:
         use_labels=True,
         decoder_start_token_id=0,
         encoder_ffn_dim=32,
-        num_encoder_layers=4,
+        num_encoder_layers=2,
         num_encoder_attention_heads=4,
         decoder_ffn_dim=32,
-        num_decoder_layers=4,
+        num_decoder_layers=2,
         num_decoder_attention_heads=4,
         max_position_embeddings=30,
         is_encoder_decoder=False,
diff --git a/tests/models/qdqbert/test_modeling_qdqbert.py b/tests/models/qdqbert/test_modeling_qdqbert.py
index cc05389eea..d10abb733e 100644
--- a/tests/models/qdqbert/test_modeling_qdqbert.py
+++ b/tests/models/qdqbert/test_modeling_qdqbert.py
@@ -54,7 +54,7 @@ class QDQBertModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/realm/test_modeling_realm.py b/tests/models/realm/test_modeling_realm.py
index ddd6c26450..4d6d9fd0ff 100644
--- a/tests/models/realm/test_modeling_realm.py
+++ b/tests/models/realm/test_modeling_realm.py
@@ -54,7 +54,7 @@ class RealmModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/rembert/test_modeling_rembert.py b/tests/models/rembert/test_modeling_rembert.py
index 4e6754b2e5..557a42243d 100644
--- a/tests/models/rembert/test_modeling_rembert.py
+++ b/tests/models/rembert/test_modeling_rembert.py
@@ -55,7 +55,7 @@ class RemBertModelTester:
         hidden_size=32,
         input_embedding_size=18,
         output_embedding_size=43,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/roberta/test_modeling_flax_roberta.py b/tests/models/roberta/test_modeling_flax_roberta.py
index c325e295f6..f82479aa70 100644
--- a/tests/models/roberta/test_modeling_flax_roberta.py
+++ b/tests/models/roberta/test_modeling_flax_roberta.py
@@ -46,7 +46,7 @@ class FlaxRobertaModelTester(unittest.TestCase):
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py
index 7ca78e23b7..40c85123c4 100644
--- a/tests/models/roberta/test_modeling_roberta.py
+++ b/tests/models/roberta/test_modeling_roberta.py
@@ -58,7 +58,7 @@ class RobertaModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py
index 3f15ca9ff3..8500dfcb67 100644
--- a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py
+++ b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py
@@ -49,7 +49,7 @@ class FlaxRobertaPreLayerNormModelTester(unittest.TestCase):
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py
index 4e4915147f..c44e1613b2 100644
--- a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py
+++ b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py
@@ -57,7 +57,7 @@ class RobertaPreLayerNormModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py
index 2efd9b7992..d1caca6b6f 100644
--- a/tests/models/roc_bert/test_modeling_roc_bert.py
+++ b/tests/models/roc_bert/test_modeling_roc_bert.py
@@ -58,7 +58,7 @@ class RoCBertModelTester:
         pronunciation_embed_dim=32,
         shape_embed_dim=32,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/roformer/test_modeling_flax_roformer.py b/tests/models/roformer/test_modeling_flax_roformer.py
index 28d0ffba95..8364e121b4 100644
--- a/tests/models/roformer/test_modeling_flax_roformer.py
+++ b/tests/models/roformer/test_modeling_flax_roformer.py
@@ -47,7 +47,7 @@ class FlaxRoFormerModelTester(unittest.TestCase):
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/roformer/test_modeling_roformer.py b/tests/models/roformer/test_modeling_roformer.py
index 357e126a04..e54d31d154 100644
--- a/tests/models/roformer/test_modeling_roformer.py
+++ b/tests/models/roformer/test_modeling_roformer.py
@@ -56,7 +56,7 @@ class RoFormerModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py
index 2b9cc47133..4ca5cfdf9e 100644
--- a/tests/models/rwkv/test_modeling_rwkv.py
+++ b/tests/models/rwkv/test_modeling_rwkv.py
@@ -52,7 +52,7 @@ class RwkvModelTester:
         use_mc_token_ids=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         intermediate_size=37,
         hidden_act="gelu",
         hidden_dropout_prob=0.1,
diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py
index 651600c437..876b232a11 100644
--- a/tests/models/sew/test_modeling_sew.py
+++ b/tests/models/sew/test_modeling_sew.py
@@ -65,7 +65,7 @@ class SEWModelTester:
         num_conv_pos_embeddings=31,
         num_conv_pos_embedding_groups=2,
         squeeze_factor=2,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         hidden_dropout=0.1,
         intermediate_size=20,
diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py
index 9aa4b8edac..dc33e80ede 100644
--- a/tests/models/sew_d/test_modeling_sew_d.py
+++ b/tests/models/sew_d/test_modeling_sew_d.py
@@ -72,7 +72,7 @@ class SEWDModelTester:
         position_biased_input=False,
         pos_att_type=("p2c", "c2p"),
         norm_rel_ebd="layer_norm",
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         hidden_dropout=0.1,
         intermediate_size=20,
diff --git a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py b/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py
index ccd5bfa189..cbb449c6e7 100644
--- a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py
+++ b/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py
@@ -50,7 +50,7 @@ class Speech2Text2StandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         decoder_attention_heads=4,
         max_position_embeddings=30,
         pad_token_id=0,
diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py
index c357259d78..9324996ffe 100644
--- a/tests/models/speecht5/test_modeling_speecht5.py
+++ b/tests/models/speecht5/test_modeling_speecht5.py
@@ -105,7 +105,7 @@ class SpeechT5ModelTester:
         is_training=False,
         vocab_size=81,
         hidden_size=24,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         intermediate_size=4,
     ):
@@ -249,7 +249,7 @@ class SpeechT5ForSpeechToTextTester:
         decoder_seq_length=7,
         is_training=False,
         hidden_size=24,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         intermediate_size=4,
         conv_dim=(32, 32, 32),
@@ -786,7 +786,7 @@ class SpeechT5ForTextToSpeechTester:
         decoder_seq_length=1024,  # speech is longer
         is_training=False,
         hidden_size=24,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         intermediate_size=4,
         vocab_size=81,
@@ -1031,7 +1031,7 @@ class SpeechT5ForSpeechToSpeechTester:
         decoder_seq_length=1024,
         is_training=False,
         hidden_size=24,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         intermediate_size=4,
         conv_dim=(32, 32, 32),
diff --git a/tests/models/splinter/test_modeling_splinter.py b/tests/models/splinter/test_modeling_splinter.py
index 24a0753157..90ee07c354 100644
--- a/tests/models/splinter/test_modeling_splinter.py
+++ b/tests/models/splinter/test_modeling_splinter.py
@@ -46,7 +46,7 @@ class SplinterModelTester:
         vocab_size=99,
         hidden_size=32,
         question_token_id=1,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/squeezebert/test_modeling_squeezebert.py b/tests/models/squeezebert/test_modeling_squeezebert.py
index 5efb030311..bf86792f57 100644
--- a/tests/models/squeezebert/test_modeling_squeezebert.py
+++ b/tests/models/squeezebert/test_modeling_squeezebert.py
@@ -50,7 +50,7 @@ class SqueezeBertModelTester(object):
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=64,
         hidden_act="gelu",
diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py
index abe785eca0..ae9966a1db 100644
--- a/tests/models/switch_transformers/test_modeling_switch_transformers.py
+++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py
@@ -58,7 +58,7 @@ class SwitchTransformersModelTester:
         use_attention_mask=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
@@ -826,7 +826,7 @@ class SwitchTransformersEncoderOnlyModelTester:
         # For common tests
         use_attention_mask=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
diff --git a/tests/models/t5/test_modeling_flax_t5.py b/tests/models/t5/test_modeling_flax_t5.py
index a2a80ab25b..d5d729dac9 100644
--- a/tests/models/t5/test_modeling_flax_t5.py
+++ b/tests/models/t5/test_modeling_flax_t5.py
@@ -70,7 +70,7 @@ class FlaxT5ModelTester:
         use_attention_mask=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
@@ -477,7 +477,7 @@ class FlaxT5EncoderOnlyModelTester:
         use_attention_mask=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py
index 41aade960e..cae891ef8b 100644
--- a/tests/models/t5/test_modeling_t5.py
+++ b/tests/models/t5/test_modeling_t5.py
@@ -71,7 +71,7 @@ class T5ModelTester:
         use_attention_mask=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
@@ -902,7 +902,7 @@ class T5EncoderOnlyModelTester:
         # For common tests
         use_attention_mask=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
diff --git a/tests/models/tapas/test_modeling_tapas.py b/tests/models/tapas/test_modeling_tapas.py
index 619a5d2612..6a482d03be 100644
--- a/tests/models/tapas/test_modeling_tapas.py
+++ b/tests/models/tapas/test_modeling_tapas.py
@@ -79,7 +79,7 @@ class TapasModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/timesformer/test_modeling_timesformer.py b/tests/models/timesformer/test_modeling_timesformer.py
index 2783a65ced..2b7a5e279f 100644
--- a/tests/models/timesformer/test_modeling_timesformer.py
+++ b/tests/models/timesformer/test_modeling_timesformer.py
@@ -60,7 +60,7 @@ class TimesformerModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/transfo_xl/test_modeling_transfo_xl.py b/tests/models/transfo_xl/test_modeling_transfo_xl.py
index 970f87bf10..63afd438d9 100644
--- a/tests/models/transfo_xl/test_modeling_transfo_xl.py
+++ b/tests/models/transfo_xl/test_modeling_transfo_xl.py
@@ -52,7 +52,7 @@ class TransfoXLModelTester:
         d_head=8,
         d_inner=128,
         div_val=2,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         scope=None,
         seed=1,
         eos_token_id=0,
diff --git a/tests/models/trocr/test_modeling_trocr.py b/tests/models/trocr/test_modeling_trocr.py
index 0033f339ae..da24c7dd43 100644
--- a/tests/models/trocr/test_modeling_trocr.py
+++ b/tests/models/trocr/test_modeling_trocr.py
@@ -47,7 +47,7 @@ class TrOCRStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         decoder_attention_heads=4,
         max_position_embeddings=30,
         pad_token_id=0,
diff --git a/tests/models/tvlt/test_modeling_tvlt.py b/tests/models/tvlt/test_modeling_tvlt.py
index e437b2651e..3ee7f7adc7 100644
--- a/tests/models/tvlt/test_modeling_tvlt.py
+++ b/tests/models/tvlt/test_modeling_tvlt.py
@@ -68,7 +68,7 @@ class TvltModelTester:
         num_audio_channels=1,
         num_frames=2,
         hidden_size=32,
-        num_hidden_layers=3,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=128,
         hidden_act="gelu",
diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py
index 29f8502179..d9fd852c88 100644
--- a/tests/models/umt5/test_modeling_umt5.py
+++ b/tests/models/umt5/test_modeling_umt5.py
@@ -64,7 +64,7 @@ class UMT5ModelTester:
         use_attention_mask=True,
         use_labels=False,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py
index 6d0bd1bf1f..ac770bdbb6 100644
--- a/tests/models/unispeech/test_modeling_unispeech.py
+++ b/tests/models/unispeech/test_modeling_unispeech.py
@@ -65,7 +65,7 @@ class UniSpeechModelTester:
         conv_bias=False,
         num_conv_pos_embeddings=16,
         num_conv_pos_embedding_groups=2,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         hidden_dropout_prob=0.1,  # this is most likely not correctly set yet
         intermediate_size=20,
diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
index a418a56dad..9c8cffba9a 100644
--- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
+++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
@@ -67,7 +67,7 @@ class UniSpeechSatModelTester:
         conv_bias=False,
         num_conv_pos_embeddings=16,
         num_conv_pos_embedding_groups=2,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         hidden_dropout_prob=0.1,  # this is most likely not correctly set yet
         intermediate_size=20,
diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py
index 85a0d2714e..9fb9c9e7f3 100644
--- a/tests/models/videomae/test_modeling_videomae.py
+++ b/tests/models/videomae/test_modeling_videomae.py
@@ -62,7 +62,7 @@ class VideoMAEModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py
index 772091d5b9..4aa036ebb6 100644
--- a/tests/models/vilt/test_modeling_vilt.py
+++ b/tests/models/vilt/test_modeling_vilt.py
@@ -65,7 +65,7 @@ class ViltModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/visual_bert/test_modeling_visual_bert.py b/tests/models/visual_bert/test_modeling_visual_bert.py
index cf48fd7ffb..9000be33ab 100644
--- a/tests/models/visual_bert/test_modeling_visual_bert.py
+++ b/tests/models/visual_bert/test_modeling_visual_bert.py
@@ -54,7 +54,7 @@ class VisualBertModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/vit/test_modeling_flax_vit.py b/tests/models/vit/test_modeling_flax_vit.py
index ca3130493e..af56f4717b 100644
--- a/tests/models/vit/test_modeling_flax_vit.py
+++ b/tests/models/vit/test_modeling_flax_vit.py
@@ -41,7 +41,7 @@ class FlaxViTModelTester(unittest.TestCase):
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/vit/test_modeling_vit.py b/tests/models/vit/test_modeling_vit.py
index 67c6e4acb1..82ba910ec8 100644
--- a/tests/models/vit/test_modeling_vit.py
+++ b/tests/models/vit/test_modeling_vit.py
@@ -59,7 +59,7 @@ class ViTModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
index fc816750e7..20747b2d54 100644
--- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
+++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
@@ -50,7 +50,7 @@ class ViTHybridModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py
index 3cedb0c176..89a3a0d803 100644
--- a/tests/models/vit_mae/test_modeling_vit_mae.py
+++ b/tests/models/vit_mae/test_modeling_vit_mae.py
@@ -56,7 +56,7 @@ class ViTMAEModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/vit_msn/test_modeling_vit_msn.py b/tests/models/vit_msn/test_modeling_vit_msn.py
index 173dca0913..a531637751 100644
--- a/tests/models/vit_msn/test_modeling_vit_msn.py
+++ b/tests/models/vit_msn/test_modeling_vit_msn.py
@@ -52,7 +52,7 @@ class ViTMSNModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py
index b9b52dc121..4cff7dca41 100644
--- a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py
@@ -123,7 +123,7 @@ class FlaxWav2Vec2ModelTester:
         conv_bias=False,
         num_conv_pos_embeddings=16,
         num_conv_pos_embedding_groups=2,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         hidden_dropout_prob=0.1,  # this is most likely not correctly set yet
         intermediate_size=20,
diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py
index 630a5d8e85..fb639077b6 100644
--- a/tests/models/wav2vec2/test_modeling_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py
@@ -153,7 +153,7 @@ class Wav2Vec2ModelTester:
         conv_bias=False,
         num_conv_pos_embeddings=16,
         num_conv_pos_embedding_groups=2,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         hidden_dropout_prob=0.1,  # this is most likely not correctly set yet
         intermediate_size=20,
diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
index 8c26268c6d..a79e8ac1ea 100644
--- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
+++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py
@@ -71,7 +71,7 @@ class Wav2Vec2ConformerModelTester:
         conv_bias=False,
         num_conv_pos_embeddings=16,
         num_conv_pos_embedding_groups=2,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         hidden_dropout_prob=0.1,
         intermediate_size=20,
diff --git a/tests/models/wavlm/test_modeling_wavlm.py b/tests/models/wavlm/test_modeling_wavlm.py
index b04a96dd1c..05385b68b0 100644
--- a/tests/models/wavlm/test_modeling_wavlm.py
+++ b/tests/models/wavlm/test_modeling_wavlm.py
@@ -64,7 +64,7 @@ class WavLMModelTester:
         conv_bias=False,
         num_conv_pos_embeddings=16,
         num_conv_pos_embedding_groups=2,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         hidden_dropout_prob=0.1,  # this is most likely not correctly set yet
         intermediate_size=20,
diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py
index ef2d11ac62..5c602d3d3e 100644
--- a/tests/models/x_clip/test_modeling_x_clip.py
+++ b/tests/models/x_clip/test_modeling_x_clip.py
@@ -61,7 +61,7 @@ class XCLIPVisionModelTester:
         num_frames=8,  # important; the batch size * time must be divisible by the number of frames
         is_training=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         mit_hidden_size=64,
@@ -318,7 +318,7 @@ class XCLIPTextModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
diff --git a/tests/models/xglm/test_modeling_flax_xglm.py b/tests/models/xglm/test_modeling_flax_xglm.py
index 60436cb1f9..8f1c9a5e2a 100644
--- a/tests/models/xglm/test_modeling_flax_xglm.py
+++ b/tests/models/xglm/test_modeling_flax_xglm.py
@@ -53,7 +53,7 @@ class FlaxXGLMModelTester:
         use_labels=True,
         vocab_size=99,
         d_model=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         ffn_dim=37,
         activation_function="gelu",
diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py
index bbb87abe6d..e6c013cca1 100644
--- a/tests/models/xglm/test_modeling_xglm.py
+++ b/tests/models/xglm/test_modeling_xglm.py
@@ -44,7 +44,7 @@ class XGLMModelTester:
         use_labels=True,
         vocab_size=99,
         d_model=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         ffn_dim=37,
         activation_function="gelu",
diff --git a/tests/models/xlm/test_modeling_xlm.py b/tests/models/xlm/test_modeling_xlm.py
index d8a1844114..b551e7e645 100644
--- a/tests/models/xlm/test_modeling_xlm.py
+++ b/tests/models/xlm/test_modeling_xlm.py
@@ -57,7 +57,7 @@ class XLMModelTester:
         vocab_size=99,
         n_special=0,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,
diff --git a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py
index 7ec84c9b1f..828d6a02a6 100644
--- a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py
+++ b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py
@@ -55,7 +55,7 @@ class XLMRobertaXLModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/xlnet/test_modeling_xlnet.py b/tests/models/xlnet/test_modeling_xlnet.py
index 2b3f4752ee..2b0c95cd6d 100644
--- a/tests/models/xlnet/test_modeling_xlnet.py
+++ b/tests/models/xlnet/test_modeling_xlnet.py
@@ -56,7 +56,7 @@ class XLNetModelTester:
         hidden_size=32,
         num_attention_heads=4,
         d_inner=128,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         type_sequence_label_size=2,
         untie_r=True,
         bi_data=False,
diff --git a/tests/models/xmod/test_modeling_xmod.py b/tests/models/xmod/test_modeling_xmod.py
index 5845dee744..fc1ce44e35 100644
--- a/tests/models/xmod/test_modeling_xmod.py
+++ b/tests/models/xmod/test_modeling_xmod.py
@@ -51,7 +51,7 @@ class XmodModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/yolos/test_modeling_yolos.py b/tests/models/yolos/test_modeling_yolos.py
index 7c49bb864e..c1fb50e30b 100644
--- a/tests/models/yolos/test_modeling_yolos.py
+++ b/tests/models/yolos/test_modeling_yolos.py
@@ -52,7 +52,7 @@ class YolosModelTester:
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/yoso/test_modeling_yoso.py b/tests/models/yoso/test_modeling_yoso.py
index e275e19e40..67d7b9edc4 100644
--- a/tests/models/yoso/test_modeling_yoso.py
+++ b/tests/models/yoso/test_modeling_yoso.py
@@ -51,7 +51,7 @@ class YosoModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index d80624b718..0cdc94fc8e 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -1017,7 +1017,8 @@ class ModelTesterMixin:
             attentions = outputs[-1]
 
             self.assertEqual(attentions[0].shape[-3], 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+            # TODO: To have this check, we will need at least 3 layers. Do we really need it?
+            # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
             self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
 
     def test_head_pruning_save_load_from_pretrained(self):
@@ -1053,7 +1054,8 @@ class ModelTesterMixin:
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class))
             attentions = outputs[-1]
             self.assertEqual(attentions[0].shape[-3], 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+            # TODO: To have this check, we will need at least 3 layers. Do we really need it?
+            # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
             self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
 
     def test_head_pruning_save_load_from_config_init(self):
@@ -1087,7 +1089,8 @@ class ModelTesterMixin:
             attentions = outputs[-1]
 
             self.assertEqual(attentions[0].shape[-3], 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+            # TODO: To have this check, we will need at least 3 layers. Do we really need it?
+            # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
             self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
 
     def test_head_pruning_integration(self):
@@ -1106,7 +1109,7 @@ class ModelTesterMixin:
             inputs_dict["output_attentions"] = True
             config.output_hidden_states = False
 
-            heads_to_prune = {0: [0], 1: [1, 2]}
+            heads_to_prune = {1: [1, 2]}
             config.pruned_heads = heads_to_prune
 
             model = model_class(config=config)
@@ -1117,10 +1120,8 @@ class ModelTesterMixin:
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class))
             attentions = outputs[-1]
 
-            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 0)
             self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
 
             with tempfile.TemporaryDirectory() as temp_dir_name:
                 model.save_pretrained(temp_dir_name)
@@ -1131,12 +1132,10 @@ class ModelTesterMixin:
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class))
             attentions = outputs[-1]
 
-            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 0)
             self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
 
-            heads_to_prune = {0: [0], 2: [1, 2]}
+            heads_to_prune = {0: [0], 1: [1, 2]}
             model.prune_heads(heads_to_prune)
 
             with torch.no_grad():
@@ -1145,10 +1144,8 @@ class ModelTesterMixin:
 
             self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
             self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
-            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
 
-            self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
+            self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2]})
 
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):