From bd90cda9a6bb4723515c17df1192e53abc8e36e3 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 2 Aug 2023 20:22:36 +0200 Subject: [PATCH] =?UTF-8?q?CI=20with=20`num=5Fhidden=5Flayers=3D2`=20?= =?UTF-8?q?=F0=9F=9A=80=F0=9F=9A=80=F0=9F=9A=80=20(#25266)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * CI with layers=2 --------- Co-authored-by: ydshieh --- tests/models/albert/test_modeling_albert.py | 5 ++-- .../albert/test_modeling_flax_albert.py | 2 +- tests/models/align/test_modeling_align.py | 2 +- tests/models/altclip/test_modeling_altclip.py | 4 +-- ..._modeling_audio_spectrogram_transformer.py | 2 +- tests/models/bart/test_modeling_bart.py | 2 +- tests/models/beit/test_modeling_beit.py | 2 +- tests/models/beit/test_modeling_flax_beit.py | 2 +- tests/models/bert/test_modeling_bert.py | 2 +- tests/models/bert/test_modeling_flax_bert.py | 2 +- .../test_modeling_bert_generation.py | 2 +- .../test_modeling_bigbird_pegasus.py | 2 +- tests/models/biogpt/test_modeling_biogpt.py | 2 +- .../blenderbot/test_modeling_blenderbot.py | 2 +- .../test_modeling_blenderbot_small.py | 2 +- tests/models/blip/test_modeling_blip.py | 4 +-- tests/models/blip/test_modeling_blip_text.py | 2 +- tests/models/blip_2/test_modeling_blip_2.py | 8 +++--- tests/models/bloom/test_modeling_bloom.py | 2 +- tests/models/canine/test_modeling_canine.py | 2 +- .../test_modeling_chinese_clip.py | 4 +-- tests/models/clap/test_modeling_clap.py | 2 +- tests/models/clip/test_modeling_clip.py | 4 +-- tests/models/clip/test_modeling_flax_clip.py | 4 +-- tests/models/clipseg/test_modeling_clipseg.py | 17 ++++++++++--- tests/models/codegen/test_modeling_codegen.py | 2 +- .../models/convbert/test_modeling_convbert.py | 2 +- tests/models/cpmant/test_modeling_cpmant.py | 2 +- tests/models/ctrl/test_modeling_ctrl.py | 2 +- .../data2vec/test_modeling_data2vec_audio.py | 2 +- .../data2vec/test_modeling_data2vec_text.py | 2 +- .../data2vec/test_modeling_data2vec_vision.py | 2 +- tests/models/deberta/test_modeling_deberta.py | 2 +- .../deberta_v2/test_modeling_deberta_v2.py | 2 +- tests/models/deit/test_modeling_deit.py | 2 +- tests/models/dinov2/test_modeling_dinov2.py | 2 +- .../distilbert/test_modeling_distilbert.py | 2 +- .../test_modeling_flax_distilbert.py | 2 +- tests/models/dpr/test_modeling_dpr.py | 2 +- tests/models/dpt/test_modeling_dpt.py | 4 +-- tests/models/electra/test_modeling_electra.py | 2 +- .../electra/test_modeling_flax_electra.py | 2 +- tests/models/ernie/test_modeling_ernie.py | 2 +- tests/models/ernie_m/test_modeling_ernie_m.py | 2 +- tests/models/esm/test_modeling_esm.py | 2 +- tests/models/esm/test_modeling_esmfold.py | 2 +- tests/models/falcon/test_modeling_falcon.py | 2 +- .../models/flaubert/test_modeling_flaubert.py | 2 +- tests/models/flava/test_modeling_flava.py | 6 ++--- tests/models/fnet/test_modeling_fnet.py | 2 +- tests/models/git/test_modeling_git.py | 4 +-- tests/models/gpt2/test_modeling_flax_gpt2.py | 2 +- tests/models/gpt2/test_modeling_gpt2.py | 2 +- .../gpt_bigcode/test_modeling_gpt_bigcode.py | 2 +- .../gpt_neo/test_modeling_flax_gpt_neo.py | 4 +-- tests/models/gpt_neo/test_modeling_gpt_neo.py | 4 +-- .../models/gpt_neox/test_modeling_gpt_neox.py | 2 +- .../test_modeling_gpt_neox_japanese.py | 2 +- tests/models/gptj/test_modeling_flax_gptj.py | 2 +- tests/models/gptj/test_modeling_gptj.py | 2 +- .../test_modeling_gptsan_japanese.py | 2 +- .../models/groupvit/test_modeling_groupvit.py | 6 ++++- tests/models/hubert/test_modeling_hubert.py | 2 +- tests/models/ibert/test_modeling_ibert.py | 2 +- .../models/imagegpt/test_modeling_imagegpt.py | 2 +- .../test_modeling_instructblip.py | 6 ++--- .../models/layoutlm/test_modeling_layoutlm.py | 2 +- .../layoutlmv2/test_modeling_layoutlmv2.py | 2 +- .../layoutlmv3/test_modeling_layoutlmv3.py | 2 +- tests/models/llama/test_modeling_llama.py | 2 +- .../longformer/test_modeling_longformer.py | 2 +- .../longt5/test_modeling_flax_longt5.py | 2 +- tests/models/longt5/test_modeling_longt5.py | 4 +-- tests/models/luke/test_modeling_luke.py | 2 +- tests/models/marian/test_modeling_marian.py | 2 +- .../models/markuplm/test_modeling_markuplm.py | 2 +- tests/models/mbart/test_modeling_mbart.py | 2 +- tests/models/mega/test_modeling_mega.py | 2 +- .../test_modeling_megatron_bert.py | 2 +- tests/models/mgp_str/test_modeling_mgp_str.py | 2 +- .../mobilebert/test_modeling_mobilebert.py | 2 +- tests/models/mpnet/test_modeling_mpnet.py | 2 +- tests/models/mpt/test_modeling_mpt.py | 2 +- tests/models/mra/test_modeling_mra.py | 2 +- tests/models/mvp/test_modeling_mvp.py | 2 +- tests/models/nezha/test_modeling_nezha.py | 2 +- .../models/nllb_moe/test_modeling_nllb_moe.py | 2 +- .../test_modeling_nystromformer.py | 2 +- tests/models/openai/test_modeling_openai.py | 2 +- tests/models/opt/test_modeling_opt.py | 2 +- tests/models/owlvit/test_modeling_owlvit.py | 2 +- .../pegasus/test_modeling_flax_pegasus.py | 2 +- tests/models/pegasus/test_modeling_pegasus.py | 2 +- .../pegasus_x/test_modeling_pegasus_x.py | 2 +- .../pix2struct/test_modeling_pix2struct.py | 4 +-- tests/models/plbart/test_modeling_plbart.py | 2 +- .../prophetnet/test_modeling_prophetnet.py | 16 ++++++------ tests/models/qdqbert/test_modeling_qdqbert.py | 2 +- tests/models/realm/test_modeling_realm.py | 2 +- tests/models/rembert/test_modeling_rembert.py | 2 +- .../roberta/test_modeling_flax_roberta.py | 2 +- tests/models/roberta/test_modeling_roberta.py | 2 +- ...test_modeling_flax_roberta_prelayernorm.py | 2 +- .../test_modeling_roberta_prelayernorm.py | 2 +- .../models/roc_bert/test_modeling_roc_bert.py | 2 +- .../roformer/test_modeling_flax_roformer.py | 2 +- .../models/roformer/test_modeling_roformer.py | 2 +- tests/models/rwkv/test_modeling_rwkv.py | 2 +- tests/models/sew/test_modeling_sew.py | 2 +- tests/models/sew_d/test_modeling_sew_d.py | 2 +- .../test_modeling_speech_to_text_2.py | 2 +- .../models/speecht5/test_modeling_speecht5.py | 8 +++--- .../models/splinter/test_modeling_splinter.py | 2 +- .../squeezebert/test_modeling_squeezebert.py | 2 +- .../test_modeling_switch_transformers.py | 4 +-- tests/models/t5/test_modeling_flax_t5.py | 4 +-- tests/models/t5/test_modeling_t5.py | 4 +-- tests/models/tapas/test_modeling_tapas.py | 2 +- .../timesformer/test_modeling_timesformer.py | 2 +- .../transfo_xl/test_modeling_transfo_xl.py | 2 +- tests/models/trocr/test_modeling_trocr.py | 2 +- tests/models/tvlt/test_modeling_tvlt.py | 2 +- tests/models/umt5/test_modeling_umt5.py | 2 +- .../unispeech/test_modeling_unispeech.py | 2 +- .../test_modeling_unispeech_sat.py | 2 +- .../models/videomae/test_modeling_videomae.py | 2 +- tests/models/vilt/test_modeling_vilt.py | 2 +- .../visual_bert/test_modeling_visual_bert.py | 2 +- tests/models/vit/test_modeling_flax_vit.py | 2 +- tests/models/vit/test_modeling_vit.py | 2 +- .../vit_hybrid/test_modeling_vit_hybrid.py | 2 +- tests/models/vit_mae/test_modeling_vit_mae.py | 2 +- tests/models/vit_msn/test_modeling_vit_msn.py | 2 +- .../wav2vec2/test_modeling_flax_wav2vec2.py | 2 +- .../models/wav2vec2/test_modeling_wav2vec2.py | 2 +- .../test_modeling_wav2vec2_conformer.py | 2 +- tests/models/wavlm/test_modeling_wavlm.py | 2 +- tests/models/x_clip/test_modeling_x_clip.py | 4 +-- tests/models/xglm/test_modeling_flax_xglm.py | 2 +- tests/models/xglm/test_modeling_xglm.py | 2 +- tests/models/xlm/test_modeling_xlm.py | 2 +- .../test_modeling_xlm_roberta_xl.py | 2 +- tests/models/xlnet/test_modeling_xlnet.py | 2 +- tests/models/xmod/test_modeling_xmod.py | 2 +- tests/models/yolos/test_modeling_yolos.py | 2 +- tests/models/yoso/test_modeling_yoso.py | 2 +- tests/test_modeling_common.py | 25 ++++++++----------- 147 files changed, 207 insertions(+), 196 deletions(-) diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py index 96fa5596a2..75c84ad0d3 100644 --- a/tests/models/albert/test_modeling_albert.py +++ b/tests/models/albert/test_modeling_albert.py @@ -54,8 +54,9 @@ class AlbertModelTester: vocab_size=99, embedding_size=16, hidden_size=36, - num_hidden_layers=6, - num_hidden_groups=6, + num_hidden_layers=2, + # this needs to be the same as `num_hidden_layers`! + num_hidden_groups=2, num_attention_heads=6, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/albert/test_modeling_flax_albert.py b/tests/models/albert/test_modeling_flax_albert.py index 5292665f55..0bdc8065bc 100644 --- a/tests/models/albert/test_modeling_flax_albert.py +++ b/tests/models/albert/test_modeling_flax_albert.py @@ -48,7 +48,7 @@ class FlaxAlbertModelTester(unittest.TestCase): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index 35b2aebf6c..47918bcd83 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -242,7 +242,7 @@ class AlignTextModelTester: use_token_type_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index fdae7768da..244ef1ed3b 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -60,7 +60,7 @@ class AltCLIPVisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -212,7 +212,7 @@ class AltCLIPTextModelTester: hidden_size=32, projection_dim=32, project_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index c36e946f19..ce596d84e3 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -55,7 +55,7 @@ class ASTModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py index 949e647e6f..01189e5628 100644 --- a/tests/models/bart/test_modeling_bart.py +++ b/tests/models/bart/test_modeling_bart.py @@ -1289,7 +1289,7 @@ class BartStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py index 4c0e5e5a45..2a35cddf40 100644 --- a/tests/models/beit/test_modeling_beit.py +++ b/tests/models/beit/test_modeling_beit.py @@ -64,7 +64,7 @@ class BeitModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/beit/test_modeling_flax_beit.py b/tests/models/beit/test_modeling_flax_beit.py index 0587174bac..78c24220c2 100644 --- a/tests/models/beit/test_modeling_flax_beit.py +++ b/tests/models/beit/test_modeling_flax_beit.py @@ -48,7 +48,7 @@ class FlaxBeitModelTester(unittest.TestCase): is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 52c8035d8a..9aec91367d 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -57,7 +57,7 @@ class BertModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/bert/test_modeling_flax_bert.py b/tests/models/bert/test_modeling_flax_bert.py index 55ffb44019..8226899175 100644 --- a/tests/models/bert/test_modeling_flax_bert.py +++ b/tests/models/bert/test_modeling_flax_bert.py @@ -47,7 +47,7 @@ class FlaxBertModelTester(unittest.TestCase): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/bert_generation/test_modeling_bert_generation.py b/tests/models/bert_generation/test_modeling_bert_generation.py index ced98e6f72..ecd7a459e0 100644 --- a/tests/models/bert_generation/test_modeling_bert_generation.py +++ b/tests/models/bert_generation/test_modeling_bert_generation.py @@ -41,7 +41,7 @@ class BertGenerationEncoderTester: use_input_mask=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py index 5d345db3fc..aedbbb4634 100644 --- a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py +++ b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py @@ -605,7 +605,7 @@ class BigBirdPegasusStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/biogpt/test_modeling_biogpt.py b/tests/models/biogpt/test_modeling_biogpt.py index e504109308..e43fc1e41b 100644 --- a/tests/models/biogpt/test_modeling_biogpt.py +++ b/tests/models/biogpt/test_modeling_biogpt.py @@ -51,7 +51,7 @@ class BioGptModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/blenderbot/test_modeling_blenderbot.py b/tests/models/blenderbot/test_modeling_blenderbot.py index 499c7aa521..ca1630b3cf 100644 --- a/tests/models/blenderbot/test_modeling_blenderbot.py +++ b/tests/models/blenderbot/test_modeling_blenderbot.py @@ -356,7 +356,7 @@ class BlenderbotStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py index 257aa1699c..249a8a799a 100644 --- a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py @@ -365,7 +365,7 @@ class BlenderbotSmallStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index 9a6e3da06c..cf8c487082 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -70,7 +70,7 @@ class BlipVisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -221,7 +221,7 @@ class BlipTextModelTester: vocab_size=99, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/blip/test_modeling_blip_text.py b/tests/models/blip/test_modeling_blip_text.py index 488512f6e2..2301b776fe 100644 --- a/tests/models/blip/test_modeling_blip_text.py +++ b/tests/models/blip/test_modeling_blip_text.py @@ -44,7 +44,7 @@ class BlipTextModelTester: vocab_size=99, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 71f652050f..c5bdb70791 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -62,7 +62,7 @@ class Blip2VisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -215,7 +215,7 @@ class Blip2QFormerModelTester: vocab_size=99, hidden_size=32, projection_dim=32, - num_hidden_layers=6, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -289,7 +289,7 @@ class Blip2TextModelDecoderOnlyTester: use_labels=False, vocab_size=99, hidden_size=16, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=4, hidden_act="gelu", @@ -503,7 +503,7 @@ class Blip2TextModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py index 4e9b837c8a..de7cb03e7e 100644 --- a/tests/models/bloom/test_modeling_bloom.py +++ b/tests/models/bloom/test_modeling_bloom.py @@ -54,7 +54,7 @@ class BloomModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py index 057fc09131..303d465ca9 100644 --- a/tests/models/canine/test_modeling_canine.py +++ b/tests/models/canine/test_modeling_canine.py @@ -53,7 +53,7 @@ class CanineModelTester: # NOTE: this is not a model parameter, just an input vocab_size=100000, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 894f1e1279..137c3c2888 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -69,7 +69,7 @@ class ChineseCLIPTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", @@ -246,7 +246,7 @@ class ChineseCLIPVisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 35c0f4e203..dc5718850f 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -287,7 +287,7 @@ class ClapTextModelTester: vocab_size=99, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 4239ce5ed0..996bea95b9 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -86,7 +86,7 @@ class CLIPVisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -261,7 +261,7 @@ class CLIPTextModelTester: vocab_size=99, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/clip/test_modeling_flax_clip.py b/tests/models/clip/test_modeling_flax_clip.py index 7d63fa9edf..565c641aef 100644 --- a/tests/models/clip/test_modeling_flax_clip.py +++ b/tests/models/clip/test_modeling_flax_clip.py @@ -35,7 +35,7 @@ class FlaxCLIPVisionModelTester: num_channels=3, is_training=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -252,7 +252,7 @@ class FlaxCLIPTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index e931bdc8d5..37a71d1b18 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -78,7 +78,7 @@ class CLIPSegVisionModelTester: num_channels=3, is_training=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -228,7 +228,7 @@ class CLIPSegTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -346,7 +346,15 @@ class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase): class CLIPSegModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): + def __init__( + self, + parent, + text_kwargs=None, + vision_kwargs=None, + is_training=True, + # This should respect the `num_hidden_layers` in `CLIPSegVisionModelTester` + extract_layers=(1,), + ): if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -356,6 +364,7 @@ class CLIPSegModelTester: self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs) self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs) self.is_training = is_training + self.extract_layers = extract_layers def prepare_config_and_inputs(self): text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() @@ -371,7 +380,7 @@ class CLIPSegModelTester: self.vision_model_tester.get_config(), projection_dim=64, reduce_dim=32, - extract_layers=[1, 2, 3], + extract_layers=self.extract_layers, ) def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): diff --git a/tests/models/codegen/test_modeling_codegen.py b/tests/models/codegen/test_modeling_codegen.py index 9072c2b5bc..34a32caa7f 100644 --- a/tests/models/codegen/test_modeling_codegen.py +++ b/tests/models/codegen/test_modeling_codegen.py @@ -47,7 +47,7 @@ class CodeGenModelTester: vocab_size=256, hidden_size=32, rotary_dim=4, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/convbert/test_modeling_convbert.py b/tests/models/convbert/test_modeling_convbert.py index dc1550acc2..754967ce00 100644 --- a/tests/models/convbert/test_modeling_convbert.py +++ b/tests/models/convbert/test_modeling_convbert.py @@ -53,7 +53,7 @@ class ConvBertModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/cpmant/test_modeling_cpmant.py b/tests/models/cpmant/test_modeling_cpmant.py index f3ebeaad3e..6ecfe15c2e 100644 --- a/tests/models/cpmant/test_modeling_cpmant.py +++ b/tests/models/cpmant/test_modeling_cpmant.py @@ -49,7 +49,7 @@ class CpmAntModelTester: use_mc_token_ids=False, vocab_size=99, hidden_size=32, - num_hidden_layers=3, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, num_buckets=32, diff --git a/tests/models/ctrl/test_modeling_ctrl.py b/tests/models/ctrl/test_modeling_ctrl.py index dfcb2c9133..ff42744415 100644 --- a/tests/models/ctrl/test_modeling_ctrl.py +++ b/tests/models/ctrl/test_modeling_ctrl.py @@ -49,7 +49,7 @@ class CTRLModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py index 67fe0cbe70..e9448621e9 100644 --- a/tests/models/data2vec/test_modeling_data2vec_audio.py +++ b/tests/models/data2vec/test_modeling_data2vec_audio.py @@ -59,7 +59,7 @@ class Data2VecAudioModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, intermediate_size=20, diff --git a/tests/models/data2vec/test_modeling_data2vec_text.py b/tests/models/data2vec/test_modeling_data2vec_text.py index a45c9b6a8b..4b4b2835dc 100644 --- a/tests/models/data2vec/test_modeling_data2vec_text.py +++ b/tests/models/data2vec/test_modeling_data2vec_text.py @@ -57,7 +57,7 @@ class Data2VecTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/data2vec/test_modeling_data2vec_vision.py b/tests/models/data2vec/test_modeling_data2vec_vision.py index 299ffad3e7..69a763a4f2 100644 --- a/tests/models/data2vec/test_modeling_data2vec_vision.py +++ b/tests/models/data2vec/test_modeling_data2vec_vision.py @@ -59,7 +59,7 @@ class Data2VecVisionModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/deberta/test_modeling_deberta.py b/tests/models/deberta/test_modeling_deberta.py index 7daff3b522..52758e2222 100644 --- a/tests/models/deberta/test_modeling_deberta.py +++ b/tests/models/deberta/test_modeling_deberta.py @@ -47,7 +47,7 @@ class DebertaModelTester(object): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/deberta_v2/test_modeling_deberta_v2.py b/tests/models/deberta_v2/test_modeling_deberta_v2.py index 548c9617b8..abfbe7402c 100644 --- a/tests/models/deberta_v2/test_modeling_deberta_v2.py +++ b/tests/models/deberta_v2/test_modeling_deberta_v2.py @@ -48,7 +48,7 @@ class DebertaV2ModelTester(object): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/deit/test_modeling_deit.py b/tests/models/deit/test_modeling_deit.py index 37bfe3fa70..2685900afb 100644 --- a/tests/models/deit/test_modeling_deit.py +++ b/tests/models/deit/test_modeling_deit.py @@ -69,7 +69,7 @@ class DeiTModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/dinov2/test_modeling_dinov2.py b/tests/models/dinov2/test_modeling_dinov2.py index ed69faa444..cf7ff95b57 100644 --- a/tests/models/dinov2/test_modeling_dinov2.py +++ b/tests/models/dinov2/test_modeling_dinov2.py @@ -57,7 +57,7 @@ class Dinov2ModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py index 9d17a1c441..ff56afd0a9 100644 --- a/tests/models/distilbert/test_modeling_distilbert.py +++ b/tests/models/distilbert/test_modeling_distilbert.py @@ -50,7 +50,7 @@ class DistilBertModelTester(object): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/distilbert/test_modeling_flax_distilbert.py b/tests/models/distilbert/test_modeling_flax_distilbert.py index f4481a6e4a..1f5a402e86 100644 --- a/tests/models/distilbert/test_modeling_flax_distilbert.py +++ b/tests/models/distilbert/test_modeling_flax_distilbert.py @@ -47,7 +47,7 @@ class FlaxDistilBertModelTester(unittest.TestCase): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/dpr/test_modeling_dpr.py b/tests/models/dpr/test_modeling_dpr.py index cd4f430ded..b6a687a351 100644 --- a/tests/models/dpr/test_modeling_dpr.py +++ b/tests/models/dpr/test_modeling_dpr.py @@ -48,7 +48,7 @@ class DPRModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py index 62ac20df31..247791ed41 100644 --- a/tests/models/dpt/test_modeling_dpt.py +++ b/tests/models/dpt/test_modeling_dpt.py @@ -53,7 +53,7 @@ class DPTModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=4, + num_hidden_layers=2, backbone_out_indices=[0, 1, 2, 3], num_attention_heads=4, intermediate_size=37, @@ -62,7 +62,7 @@ class DPTModelTester: attention_probs_dropout_prob=0.1, initializer_range=0.02, num_labels=3, - neck_hidden_sizes=[16, 16, 32, 32], + neck_hidden_sizes=[16, 32], is_hybrid=False, scope=None, ): diff --git a/tests/models/electra/test_modeling_electra.py b/tests/models/electra/test_modeling_electra.py index 550bc14487..a5d3fa585e 100644 --- a/tests/models/electra/test_modeling_electra.py +++ b/tests/models/electra/test_modeling_electra.py @@ -54,7 +54,7 @@ class ElectraModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/electra/test_modeling_flax_electra.py b/tests/models/electra/test_modeling_flax_electra.py index 0dda4e38fd..19b35d8940 100644 --- a/tests/models/electra/test_modeling_flax_electra.py +++ b/tests/models/electra/test_modeling_flax_electra.py @@ -34,7 +34,7 @@ class FlaxElectraModelTester(unittest.TestCase): vocab_size=99, embedding_size=24, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py index e845bd1f83..f0bdec3efb 100644 --- a/tests/models/ernie/test_modeling_ernie.py +++ b/tests/models/ernie/test_modeling_ernie.py @@ -56,7 +56,7 @@ class ErnieModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/ernie_m/test_modeling_ernie_m.py b/tests/models/ernie_m/test_modeling_ernie_m.py index 5e0ac95233..1fafcd34ba 100644 --- a/tests/models/ernie_m/test_modeling_ernie_m.py +++ b/tests/models/ernie_m/test_modeling_ernie_m.py @@ -50,7 +50,7 @@ class ErnieMModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/esm/test_modeling_esm.py b/tests/models/esm/test_modeling_esm.py index 2e5d48082b..f242e77966 100644 --- a/tests/models/esm/test_modeling_esm.py +++ b/tests/models/esm/test_modeling_esm.py @@ -49,7 +49,7 @@ class EsmModelTester: use_labels=True, vocab_size=33, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py index 39f274af54..1ec5ab8acb 100644 --- a/tests/models/esm/test_modeling_esmfold.py +++ b/tests/models/esm/test_modeling_esmfold.py @@ -43,7 +43,7 @@ class EsmFoldModelTester: use_labels=False, vocab_size=19, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py index 6530eeb1a1..0efc762371 100644 --- a/tests/models/falcon/test_modeling_falcon.py +++ b/tests/models/falcon/test_modeling_falcon.py @@ -50,7 +50,7 @@ class FalconModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/flaubert/test_modeling_flaubert.py b/tests/models/flaubert/test_modeling_flaubert.py index 99dbf927e1..61806182bb 100644 --- a/tests/models/flaubert/test_modeling_flaubert.py +++ b/tests/models/flaubert/test_modeling_flaubert.py @@ -57,7 +57,7 @@ class FlaubertModelTester(object): vocab_size=99, n_special=0, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index f1221f1061..0224181637 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -79,7 +79,7 @@ class FlavaImageModelTester: parent, batch_size=12, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", @@ -342,7 +342,7 @@ class FlavaTextModelTester: max_position_embeddings=512, position_embedding_type="absolute", hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", @@ -487,7 +487,7 @@ class FlavaMultimodalModelTester: seq_length=44, use_input_mask=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py index e7e592d5b6..01e9942de2 100644 --- a/tests/models/fnet/test_modeling_fnet.py +++ b/tests/models/fnet/test_modeling_fnet.py @@ -70,7 +70,7 @@ class FNetModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1, diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index ed094db4a0..0dde54a398 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -51,7 +51,7 @@ class GitVisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -203,7 +203,7 @@ class GitModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gpt2/test_modeling_flax_gpt2.py b/tests/models/gpt2/test_modeling_flax_gpt2.py index e842bbc732..9bdc17fa19 100644 --- a/tests/models/gpt2/test_modeling_flax_gpt2.py +++ b/tests/models/gpt2/test_modeling_flax_gpt2.py @@ -52,7 +52,7 @@ class FlaxGPT2ModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py index f820b54942..c941039888 100644 --- a/tests/models/gpt2/test_modeling_gpt2.py +++ b/tests/models/gpt2/test_modeling_gpt2.py @@ -56,7 +56,7 @@ class GPT2ModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py index 8beddc0aba..3d4dd27fa4 100644 --- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -55,7 +55,7 @@ class GPTBigCodeModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="relu", diff --git a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py index a32f35f6e7..58574a8b1d 100644 --- a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py +++ b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py @@ -52,9 +52,9 @@ class FlaxGPTNeoModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=4, - attention_types=[[["global", "local"], 2]], + attention_types=[[["global", "local"], 1]], intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1, diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.py b/tests/models/gpt_neo/test_modeling_gpt_neo.py index a79cf5b25d..075b9a2663 100644 --- a/tests/models/gpt_neo/test_modeling_gpt_neo.py +++ b/tests/models/gpt_neo/test_modeling_gpt_neo.py @@ -54,8 +54,8 @@ class GPTNeoModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=4, - attention_types=[[["global", "local"], 2]], + num_hidden_layers=2, + attention_types=[[["global", "local"], 1]], num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py index 176970779e..8777bd3abd 100644 --- a/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -52,7 +52,7 @@ class GPTNeoXModelTester: use_labels=True, vocab_size=99, hidden_size=64, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py index 47bb22b627..fc78b8bdd4 100644 --- a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py +++ b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py @@ -44,7 +44,7 @@ class GPTNeoXJapaneseModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_multiple_size=4, hidden_act="gelu", diff --git a/tests/models/gptj/test_modeling_flax_gptj.py b/tests/models/gptj/test_modeling_flax_gptj.py index d177e345e8..48061f84d8 100644 --- a/tests/models/gptj/test_modeling_flax_gptj.py +++ b/tests/models/gptj/test_modeling_flax_gptj.py @@ -53,7 +53,7 @@ class FlaxGPTJModelTester: vocab_size=99, hidden_size=32, rotary_dim=4, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py index 3636d357d5..f0e0270070 100644 --- a/tests/models/gptj/test_modeling_gptj.py +++ b/tests/models/gptj/test_modeling_gptj.py @@ -56,7 +56,7 @@ class GPTJModelTester: vocab_size=99, hidden_size=32, rotary_dim=4, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py b/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py index 54a98cf70f..1a86e23fdc 100644 --- a/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py +++ b/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py @@ -45,7 +45,7 @@ class GPTSanJapaneseTester: is_training=True, hidden_size=32, ext_size=42, - num_hidden_layers=5, + num_hidden_layers=2, num_ext_layers=2, num_attention_heads=4, num_experts=2, diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index 261841277a..6d52b6b501 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -356,7 +356,7 @@ class GroupViTTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -553,6 +553,10 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def test_model_common_attributes(self): pass + # overwritten from parent as this equivalent test needs a specific `seed` and hard to get a good one! + def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=2e-5, name="outputs", attributes=None): + super().check_pt_tf_outputs(tf_outputs, pt_outputs, model_class, tol=tol, name=name, attributes=attributes) + @is_pt_tf_cross_test def test_pt_tf_model_equivalence(self): import tensorflow as tf diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index bad1a561da..c5a6a1398f 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -71,7 +71,7 @@ class HubertModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, # this is most likely not correctly set yet intermediate_size=20, diff --git a/tests/models/ibert/test_modeling_ibert.py b/tests/models/ibert/test_modeling_ibert.py index 9f2f2c9502..096a55169a 100644 --- a/tests/models/ibert/test_modeling_ibert.py +++ b/tests/models/ibert/test_modeling_ibert.py @@ -62,7 +62,7 @@ class IBertModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index 19fe688bf4..b4e2cd5ab4 100644 --- a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -65,7 +65,7 @@ class ImageGPTModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index d659c38916..49d780918c 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -64,7 +64,7 @@ class InstructBlipVisionModelTester: is_training=True, hidden_size=32, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -219,7 +219,7 @@ class InstructBlipQFormerModelTester: vocab_size=99, hidden_size=32, projection_dim=32, - num_hidden_layers=6, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -295,7 +295,7 @@ class InstructBlipTextModelDecoderOnlyTester: use_labels=False, vocab_size=99, hidden_size=16, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=4, hidden_act="gelu", diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py index 0535fbf4e1..aafa53969d 100644 --- a/tests/models/layoutlm/test_modeling_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_layoutlm.py @@ -48,7 +48,7 @@ class LayoutLMModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index c8457331c5..cffa09d6d0 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -55,7 +55,7 @@ class LayoutLMv2ModelTester: use_labels=True, vocab_size=99, hidden_size=36, - num_hidden_layers=3, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py index 2c3aef9b93..bf9a0b8314 100644 --- a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py @@ -63,7 +63,7 @@ class LayoutLMv3ModelTester: use_labels=True, vocab_size=99, hidden_size=36, - num_hidden_layers=3, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index c2efc3f5a3..e8d5f9abe6 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -46,7 +46,7 @@ class LlamaModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/longformer/test_modeling_longformer.py b/tests/models/longformer/test_modeling_longformer.py index 21853e4420..b40e464e60 100644 --- a/tests/models/longformer/test_modeling_longformer.py +++ b/tests/models/longformer/test_modeling_longformer.py @@ -50,7 +50,7 @@ class LongformerModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/longt5/test_modeling_flax_longt5.py b/tests/models/longt5/test_modeling_flax_longt5.py index 2c262bef30..9449cfa5e3 100644 --- a/tests/models/longt5/test_modeling_flax_longt5.py +++ b/tests/models/longt5/test_modeling_flax_longt5.py @@ -71,7 +71,7 @@ class FlaxLongT5ModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/longt5/test_modeling_longt5.py b/tests/models/longt5/test_modeling_longt5.py index 0f7ae0a272..b2d17dc0e6 100644 --- a/tests/models/longt5/test_modeling_longt5.py +++ b/tests/models/longt5/test_modeling_longt5.py @@ -59,7 +59,7 @@ class LongT5ModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, @@ -916,7 +916,7 @@ class LongT5EncoderOnlyModelTester: # For common tests use_attention_mask=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/luke/test_modeling_luke.py b/tests/models/luke/test_modeling_luke.py index 35bdb6b6d5..95cb4f5d01 100644 --- a/tests/models/luke/test_modeling_luke.py +++ b/tests/models/luke/test_modeling_luke.py @@ -61,7 +61,7 @@ class LukeModelTester: entity_vocab_size=10, entity_emb_size=6, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py index 6cbcd55d3f..8fd5e04a56 100644 --- a/tests/models/marian/test_modeling_marian.py +++ b/tests/models/marian/test_modeling_marian.py @@ -661,7 +661,7 @@ class MarianStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/markuplm/test_modeling_markuplm.py b/tests/models/markuplm/test_modeling_markuplm.py index 09d2f1ad52..71757385e8 100644 --- a/tests/models/markuplm/test_modeling_markuplm.py +++ b/tests/models/markuplm/test_modeling_markuplm.py @@ -53,7 +53,7 @@ class MarkupLMModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/mbart/test_modeling_mbart.py b/tests/models/mbart/test_modeling_mbart.py index ec3d36f33d..db5b554e82 100644 --- a/tests/models/mbart/test_modeling_mbart.py +++ b/tests/models/mbart/test_modeling_mbart.py @@ -491,7 +491,7 @@ class MBartStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/mega/test_modeling_mega.py b/tests/models/mega/test_modeling_mega.py index dfb00d190f..e10ecc5487 100644 --- a/tests/models/mega/test_modeling_mega.py +++ b/tests/models/mega/test_modeling_mega.py @@ -51,7 +51,7 @@ class MegaModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, intermediate_size=37, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, diff --git a/tests/models/megatron_bert/test_modeling_megatron_bert.py b/tests/models/megatron_bert/test_modeling_megatron_bert.py index bc1d81c4e0..818f65d80c 100644 --- a/tests/models/megatron_bert/test_modeling_megatron_bert.py +++ b/tests/models/megatron_bert/test_modeling_megatron_bert.py @@ -58,7 +58,7 @@ class MegatronBertModelTester: vocab_size=99, hidden_size=64, embedding_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/mgp_str/test_modeling_mgp_str.py b/tests/models/mgp_str/test_modeling_mgp_str.py index 1d972e22a3..d8ba50a350 100644 --- a/tests/models/mgp_str/test_modeling_mgp_str.py +++ b/tests/models/mgp_str/test_modeling_mgp_str.py @@ -55,7 +55,7 @@ class MgpstrModelTester: num_bpe_labels=99, num_wordpiece_labels=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, mlp_ratio=4.0, patch_embeds_hidden_size=257, diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py index 6e4f696b8b..572490071a 100644 --- a/tests/models/mobilebert/test_modeling_mobilebert.py +++ b/tests/models/mobilebert/test_modeling_mobilebert.py @@ -54,7 +54,7 @@ class MobileBertModelTester: vocab_size=99, hidden_size=64, embedding_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/mpnet/test_modeling_mpnet.py b/tests/models/mpnet/test_modeling_mpnet.py index d3261e4bc0..fc16764174 100644 --- a/tests/models/mpnet/test_modeling_mpnet.py +++ b/tests/models/mpnet/test_modeling_mpnet.py @@ -49,7 +49,7 @@ class MPNetModelTester: use_labels=True, vocab_size=99, hidden_size=64, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=64, hidden_act="gelu", diff --git a/tests/models/mpt/test_modeling_mpt.py b/tests/models/mpt/test_modeling_mpt.py index 91cb35bb7a..363c493b1e 100644 --- a/tests/models/mpt/test_modeling_mpt.py +++ b/tests/models/mpt/test_modeling_mpt.py @@ -54,7 +54,7 @@ class MptModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/mra/test_modeling_mra.py b/tests/models/mra/test_modeling_mra.py index c6a0813032..aac9ce5bc1 100644 --- a/tests/models/mra/test_modeling_mra.py +++ b/tests/models/mra/test_modeling_mra.py @@ -51,7 +51,7 @@ class MraModelTester: use_labels=True, vocab_size=99, hidden_size=16, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=2, intermediate_size=36, hidden_act="gelu", diff --git a/tests/models/mvp/test_modeling_mvp.py b/tests/models/mvp/test_modeling_mvp.py index cc3986a370..8e6143529a 100644 --- a/tests/models/mvp/test_modeling_mvp.py +++ b/tests/models/mvp/test_modeling_mvp.py @@ -595,7 +595,7 @@ class MvpStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/nezha/test_modeling_nezha.py b/tests/models/nezha/test_modeling_nezha.py index 5b36ffbc96..a71823d8a5 100644 --- a/tests/models/nezha/test_modeling_nezha.py +++ b/tests/models/nezha/test_modeling_nezha.py @@ -55,7 +55,7 @@ class NezhaModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/nllb_moe/test_modeling_nllb_moe.py b/tests/models/nllb_moe/test_modeling_nllb_moe.py index 030b5f2a89..9311a01990 100644 --- a/tests/models/nllb_moe/test_modeling_nllb_moe.py +++ b/tests/models/nllb_moe/test_modeling_nllb_moe.py @@ -52,7 +52,7 @@ class NllbMoeModelTester: use_labels=False, vocab_size=99, hidden_size=16, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=4, hidden_act="relu", diff --git a/tests/models/nystromformer/test_modeling_nystromformer.py b/tests/models/nystromformer/test_modeling_nystromformer.py index 390308631d..ae06670103 100644 --- a/tests/models/nystromformer/test_modeling_nystromformer.py +++ b/tests/models/nystromformer/test_modeling_nystromformer.py @@ -51,7 +51,7 @@ class NystromformerModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/openai/test_modeling_openai.py b/tests/models/openai/test_modeling_openai.py index 0e8ba6d9ce..98d74ee5f8 100644 --- a/tests/models/openai/test_modeling_openai.py +++ b/tests/models/openai/test_modeling_openai.py @@ -49,7 +49,7 @@ class OpenAIGPTModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/opt/test_modeling_opt.py b/tests/models/opt/test_modeling_opt.py index 251282a91b..69a063f276 100644 --- a/tests/models/opt/test_modeling_opt.py +++ b/tests/models/opt/test_modeling_opt.py @@ -70,7 +70,7 @@ class OPTModelTester: use_labels=False, vocab_size=99, hidden_size=16, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=4, hidden_act="gelu", diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index 4dbd1fb0a8..8360b9f2a2 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -62,7 +62,7 @@ class OwlViTVisionModelTester: num_channels=3, is_training=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/pegasus/test_modeling_flax_pegasus.py b/tests/models/pegasus/test_modeling_flax_pegasus.py index fbc49c7811..62b9077f0d 100644 --- a/tests/models/pegasus/test_modeling_flax_pegasus.py +++ b/tests/models/pegasus/test_modeling_flax_pegasus.py @@ -52,7 +52,7 @@ class FlaxPegasusModelTester: use_labels=False, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_dropout_prob=0.1, diff --git a/tests/models/pegasus/test_modeling_pegasus.py b/tests/models/pegasus/test_modeling_pegasus.py index bde7477f94..4011fe2c68 100644 --- a/tests/models/pegasus/test_modeling_pegasus.py +++ b/tests/models/pegasus/test_modeling_pegasus.py @@ -371,7 +371,7 @@ class PegasusStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/pegasus_x/test_modeling_pegasus_x.py b/tests/models/pegasus_x/test_modeling_pegasus_x.py index 73c4ee62bf..22d7b0c863 100644 --- a/tests/models/pegasus_x/test_modeling_pegasus_x.py +++ b/tests/models/pegasus_x/test_modeling_pegasus_x.py @@ -670,7 +670,7 @@ class PegasusXStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index c49db3dcff..34ca767d6b 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -71,7 +71,7 @@ class Pix2StructVisionModelTester: patch_embed_hidden_size=12, projection_dim=32, max_patches=64, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, @@ -230,7 +230,7 @@ class Pix2StructTextModelTester: vocab_size=99, hidden_size=12, projection_dim=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/plbart/test_modeling_plbart.py b/tests/models/plbart/test_modeling_plbart.py index 05dbac6a2c..4cd8ecd14f 100644 --- a/tests/models/plbart/test_modeling_plbart.py +++ b/tests/models/plbart/test_modeling_plbart.py @@ -473,7 +473,7 @@ class PLBartStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, encoder_attention_heads=4, decoder_attention_heads=4, max_position_embeddings=30, diff --git a/tests/models/prophetnet/test_modeling_prophetnet.py b/tests/models/prophetnet/test_modeling_prophetnet.py index fa717b2743..eee03134d3 100644 --- a/tests/models/prophetnet/test_modeling_prophetnet.py +++ b/tests/models/prophetnet/test_modeling_prophetnet.py @@ -55,10 +55,10 @@ class ProphetNetModelTester: use_labels=True, decoder_start_token_id=0, encoder_ffn_dim=32, - num_encoder_layers=4, + num_encoder_layers=2, num_encoder_attention_heads=4, decoder_ffn_dim=32, - num_decoder_layers=4, + num_decoder_layers=2, num_decoder_attention_heads=4, max_position_embeddings=30, is_encoder_decoder=True, @@ -437,10 +437,10 @@ class ProphetNetModelTester: decoder_attention_mask=decoder_attention_mask, labels=lm_labels, ) - self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(4.5981, device=torch_device), atol=1e-3)) + self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(4.5892, device=torch_device), atol=1e-3)) expected_logit_slice = torch.tensor( - [-0.0648, 0.0790, 0.0360, 0.0089, 0.0039, -0.0639, 0.0131], device=torch_device + [-0.0184, 0.0758, -0.0543, -0.0093, 0.0050, -0.0660, -0.1453], device=torch_device ) self.parent.assertTrue(torch.allclose(result.logits[0, :, 1], expected_logit_slice, atol=1e-3)) @@ -551,10 +551,10 @@ class ProphetNetStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=0, encoder_ffn_dim=32, - num_encoder_layers=4, + num_encoder_layers=2, num_encoder_attention_heads=4, decoder_ffn_dim=32, - num_decoder_layers=4, + num_decoder_layers=2, num_decoder_attention_heads=4, max_position_embeddings=30, is_encoder_decoder=False, @@ -782,10 +782,10 @@ class ProphetNetStandaloneEncoderModelTester: use_labels=True, decoder_start_token_id=0, encoder_ffn_dim=32, - num_encoder_layers=4, + num_encoder_layers=2, num_encoder_attention_heads=4, decoder_ffn_dim=32, - num_decoder_layers=4, + num_decoder_layers=2, num_decoder_attention_heads=4, max_position_embeddings=30, is_encoder_decoder=False, diff --git a/tests/models/qdqbert/test_modeling_qdqbert.py b/tests/models/qdqbert/test_modeling_qdqbert.py index cc05389eea..d10abb733e 100644 --- a/tests/models/qdqbert/test_modeling_qdqbert.py +++ b/tests/models/qdqbert/test_modeling_qdqbert.py @@ -54,7 +54,7 @@ class QDQBertModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/realm/test_modeling_realm.py b/tests/models/realm/test_modeling_realm.py index ddd6c26450..4d6d9fd0ff 100644 --- a/tests/models/realm/test_modeling_realm.py +++ b/tests/models/realm/test_modeling_realm.py @@ -54,7 +54,7 @@ class RealmModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/rembert/test_modeling_rembert.py b/tests/models/rembert/test_modeling_rembert.py index 4e6754b2e5..557a42243d 100644 --- a/tests/models/rembert/test_modeling_rembert.py +++ b/tests/models/rembert/test_modeling_rembert.py @@ -55,7 +55,7 @@ class RemBertModelTester: hidden_size=32, input_embedding_size=18, output_embedding_size=43, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roberta/test_modeling_flax_roberta.py b/tests/models/roberta/test_modeling_flax_roberta.py index c325e295f6..f82479aa70 100644 --- a/tests/models/roberta/test_modeling_flax_roberta.py +++ b/tests/models/roberta/test_modeling_flax_roberta.py @@ -46,7 +46,7 @@ class FlaxRobertaModelTester(unittest.TestCase): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index 7ca78e23b7..40c85123c4 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -58,7 +58,7 @@ class RobertaModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py index 3f15ca9ff3..8500dfcb67 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py @@ -49,7 +49,7 @@ class FlaxRobertaPreLayerNormModelTester(unittest.TestCase): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py index 4e4915147f..c44e1613b2 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py @@ -57,7 +57,7 @@ class RobertaPreLayerNormModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py index 2efd9b7992..d1caca6b6f 100644 --- a/tests/models/roc_bert/test_modeling_roc_bert.py +++ b/tests/models/roc_bert/test_modeling_roc_bert.py @@ -58,7 +58,7 @@ class RoCBertModelTester: pronunciation_embed_dim=32, shape_embed_dim=32, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roformer/test_modeling_flax_roformer.py b/tests/models/roformer/test_modeling_flax_roformer.py index 28d0ffba95..8364e121b4 100644 --- a/tests/models/roformer/test_modeling_flax_roformer.py +++ b/tests/models/roformer/test_modeling_flax_roformer.py @@ -47,7 +47,7 @@ class FlaxRoFormerModelTester(unittest.TestCase): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/roformer/test_modeling_roformer.py b/tests/models/roformer/test_modeling_roformer.py index 357e126a04..e54d31d154 100644 --- a/tests/models/roformer/test_modeling_roformer.py +++ b/tests/models/roformer/test_modeling_roformer.py @@ -56,7 +56,7 @@ class RoFormerModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py index 2b9cc47133..4ca5cfdf9e 100644 --- a/tests/models/rwkv/test_modeling_rwkv.py +++ b/tests/models/rwkv/test_modeling_rwkv.py @@ -52,7 +52,7 @@ class RwkvModelTester: use_mc_token_ids=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1, diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py index 651600c437..876b232a11 100644 --- a/tests/models/sew/test_modeling_sew.py +++ b/tests/models/sew/test_modeling_sew.py @@ -65,7 +65,7 @@ class SEWModelTester: num_conv_pos_embeddings=31, num_conv_pos_embedding_groups=2, squeeze_factor=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout=0.1, intermediate_size=20, diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py index 9aa4b8edac..dc33e80ede 100644 --- a/tests/models/sew_d/test_modeling_sew_d.py +++ b/tests/models/sew_d/test_modeling_sew_d.py @@ -72,7 +72,7 @@ class SEWDModelTester: position_biased_input=False, pos_att_type=("p2c", "c2p"), norm_rel_ebd="layer_norm", - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout=0.1, intermediate_size=20, diff --git a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py b/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py index ccd5bfa189..cbb449c6e7 100644 --- a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py +++ b/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py @@ -50,7 +50,7 @@ class Speech2Text2StandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, decoder_attention_heads=4, max_position_embeddings=30, pad_token_id=0, diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index c357259d78..9324996ffe 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -105,7 +105,7 @@ class SpeechT5ModelTester: is_training=False, vocab_size=81, hidden_size=24, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, intermediate_size=4, ): @@ -249,7 +249,7 @@ class SpeechT5ForSpeechToTextTester: decoder_seq_length=7, is_training=False, hidden_size=24, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, intermediate_size=4, conv_dim=(32, 32, 32), @@ -786,7 +786,7 @@ class SpeechT5ForTextToSpeechTester: decoder_seq_length=1024, # speech is longer is_training=False, hidden_size=24, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, intermediate_size=4, vocab_size=81, @@ -1031,7 +1031,7 @@ class SpeechT5ForSpeechToSpeechTester: decoder_seq_length=1024, is_training=False, hidden_size=24, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, intermediate_size=4, conv_dim=(32, 32, 32), diff --git a/tests/models/splinter/test_modeling_splinter.py b/tests/models/splinter/test_modeling_splinter.py index 24a0753157..90ee07c354 100644 --- a/tests/models/splinter/test_modeling_splinter.py +++ b/tests/models/splinter/test_modeling_splinter.py @@ -46,7 +46,7 @@ class SplinterModelTester: vocab_size=99, hidden_size=32, question_token_id=1, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/squeezebert/test_modeling_squeezebert.py b/tests/models/squeezebert/test_modeling_squeezebert.py index 5efb030311..bf86792f57 100644 --- a/tests/models/squeezebert/test_modeling_squeezebert.py +++ b/tests/models/squeezebert/test_modeling_squeezebert.py @@ -50,7 +50,7 @@ class SqueezeBertModelTester(object): use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=64, hidden_act="gelu", diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py index abe785eca0..ae9966a1db 100644 --- a/tests/models/switch_transformers/test_modeling_switch_transformers.py +++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py @@ -58,7 +58,7 @@ class SwitchTransformersModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, @@ -826,7 +826,7 @@ class SwitchTransformersEncoderOnlyModelTester: # For common tests use_attention_mask=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/t5/test_modeling_flax_t5.py b/tests/models/t5/test_modeling_flax_t5.py index a2a80ab25b..d5d729dac9 100644 --- a/tests/models/t5/test_modeling_flax_t5.py +++ b/tests/models/t5/test_modeling_flax_t5.py @@ -70,7 +70,7 @@ class FlaxT5ModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, @@ -477,7 +477,7 @@ class FlaxT5EncoderOnlyModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index 41aade960e..cae891ef8b 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -71,7 +71,7 @@ class T5ModelTester: use_attention_mask=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, @@ -902,7 +902,7 @@ class T5EncoderOnlyModelTester: # For common tests use_attention_mask=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/tapas/test_modeling_tapas.py b/tests/models/tapas/test_modeling_tapas.py index 619a5d2612..6a482d03be 100644 --- a/tests/models/tapas/test_modeling_tapas.py +++ b/tests/models/tapas/test_modeling_tapas.py @@ -79,7 +79,7 @@ class TapasModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/timesformer/test_modeling_timesformer.py b/tests/models/timesformer/test_modeling_timesformer.py index 2783a65ced..2b7a5e279f 100644 --- a/tests/models/timesformer/test_modeling_timesformer.py +++ b/tests/models/timesformer/test_modeling_timesformer.py @@ -60,7 +60,7 @@ class TimesformerModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/transfo_xl/test_modeling_transfo_xl.py b/tests/models/transfo_xl/test_modeling_transfo_xl.py index 970f87bf10..63afd438d9 100644 --- a/tests/models/transfo_xl/test_modeling_transfo_xl.py +++ b/tests/models/transfo_xl/test_modeling_transfo_xl.py @@ -52,7 +52,7 @@ class TransfoXLModelTester: d_head=8, d_inner=128, div_val=2, - num_hidden_layers=5, + num_hidden_layers=2, scope=None, seed=1, eos_token_id=0, diff --git a/tests/models/trocr/test_modeling_trocr.py b/tests/models/trocr/test_modeling_trocr.py index 0033f339ae..da24c7dd43 100644 --- a/tests/models/trocr/test_modeling_trocr.py +++ b/tests/models/trocr/test_modeling_trocr.py @@ -47,7 +47,7 @@ class TrOCRStandaloneDecoderModelTester: use_labels=True, decoder_start_token_id=2, decoder_ffn_dim=32, - decoder_layers=4, + decoder_layers=2, decoder_attention_heads=4, max_position_embeddings=30, pad_token_id=0, diff --git a/tests/models/tvlt/test_modeling_tvlt.py b/tests/models/tvlt/test_modeling_tvlt.py index e437b2651e..3ee7f7adc7 100644 --- a/tests/models/tvlt/test_modeling_tvlt.py +++ b/tests/models/tvlt/test_modeling_tvlt.py @@ -68,7 +68,7 @@ class TvltModelTester: num_audio_channels=1, num_frames=2, hidden_size=32, - num_hidden_layers=3, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=128, hidden_act="gelu", diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py index 29f8502179..d9fd852c88 100644 --- a/tests/models/umt5/test_modeling_umt5.py +++ b/tests/models/umt5/test_modeling_umt5.py @@ -64,7 +64,7 @@ class UMT5ModelTester: use_attention_mask=True, use_labels=False, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py index 6d0bd1bf1f..ac770bdbb6 100644 --- a/tests/models/unispeech/test_modeling_unispeech.py +++ b/tests/models/unispeech/test_modeling_unispeech.py @@ -65,7 +65,7 @@ class UniSpeechModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, # this is most likely not correctly set yet intermediate_size=20, diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py index a418a56dad..9c8cffba9a 100644 --- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py +++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py @@ -67,7 +67,7 @@ class UniSpeechSatModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, # this is most likely not correctly set yet intermediate_size=20, diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index 85a0d2714e..9fb9c9e7f3 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -62,7 +62,7 @@ class VideoMAEModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py index 772091d5b9..4aa036ebb6 100644 --- a/tests/models/vilt/test_modeling_vilt.py +++ b/tests/models/vilt/test_modeling_vilt.py @@ -65,7 +65,7 @@ class ViltModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/visual_bert/test_modeling_visual_bert.py b/tests/models/visual_bert/test_modeling_visual_bert.py index cf48fd7ffb..9000be33ab 100644 --- a/tests/models/visual_bert/test_modeling_visual_bert.py +++ b/tests/models/visual_bert/test_modeling_visual_bert.py @@ -54,7 +54,7 @@ class VisualBertModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/vit/test_modeling_flax_vit.py b/tests/models/vit/test_modeling_flax_vit.py index ca3130493e..af56f4717b 100644 --- a/tests/models/vit/test_modeling_flax_vit.py +++ b/tests/models/vit/test_modeling_flax_vit.py @@ -41,7 +41,7 @@ class FlaxViTModelTester(unittest.TestCase): is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/vit/test_modeling_vit.py b/tests/models/vit/test_modeling_vit.py index 67c6e4acb1..82ba910ec8 100644 --- a/tests/models/vit/test_modeling_vit.py +++ b/tests/models/vit/test_modeling_vit.py @@ -59,7 +59,7 @@ class ViTModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py index fc816750e7..20747b2d54 100644 --- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py +++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py @@ -50,7 +50,7 @@ class ViTHybridModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py index 3cedb0c176..89a3a0d803 100644 --- a/tests/models/vit_mae/test_modeling_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_vit_mae.py @@ -56,7 +56,7 @@ class ViTMAEModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/vit_msn/test_modeling_vit_msn.py b/tests/models/vit_msn/test_modeling_vit_msn.py index 173dca0913..a531637751 100644 --- a/tests/models/vit_msn/test_modeling_vit_msn.py +++ b/tests/models/vit_msn/test_modeling_vit_msn.py @@ -52,7 +52,7 @@ class ViTMSNModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py index b9b52dc121..4cff7dca41 100644 --- a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py @@ -123,7 +123,7 @@ class FlaxWav2Vec2ModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, # this is most likely not correctly set yet intermediate_size=20, diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index 630a5d8e85..fb639077b6 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -153,7 +153,7 @@ class Wav2Vec2ModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, # this is most likely not correctly set yet intermediate_size=20, diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py index 8c26268c6d..a79e8ac1ea 100644 --- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py +++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py @@ -71,7 +71,7 @@ class Wav2Vec2ConformerModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, intermediate_size=20, diff --git a/tests/models/wavlm/test_modeling_wavlm.py b/tests/models/wavlm/test_modeling_wavlm.py index b04a96dd1c..05385b68b0 100644 --- a/tests/models/wavlm/test_modeling_wavlm.py +++ b/tests/models/wavlm/test_modeling_wavlm.py @@ -64,7 +64,7 @@ class WavLMModelTester: conv_bias=False, num_conv_pos_embeddings=16, num_conv_pos_embedding_groups=2, - num_hidden_layers=4, + num_hidden_layers=2, num_attention_heads=2, hidden_dropout_prob=0.1, # this is most likely not correctly set yet intermediate_size=20, diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index ef2d11ac62..5c602d3d3e 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -61,7 +61,7 @@ class XCLIPVisionModelTester: num_frames=8, # important; the batch size * time must be divisible by the number of frames is_training=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, mit_hidden_size=64, @@ -318,7 +318,7 @@ class XCLIPTextModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, dropout=0.1, diff --git a/tests/models/xglm/test_modeling_flax_xglm.py b/tests/models/xglm/test_modeling_flax_xglm.py index 60436cb1f9..8f1c9a5e2a 100644 --- a/tests/models/xglm/test_modeling_flax_xglm.py +++ b/tests/models/xglm/test_modeling_flax_xglm.py @@ -53,7 +53,7 @@ class FlaxXGLMModelTester: use_labels=True, vocab_size=99, d_model=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, ffn_dim=37, activation_function="gelu", diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py index bbb87abe6d..e6c013cca1 100644 --- a/tests/models/xglm/test_modeling_xglm.py +++ b/tests/models/xglm/test_modeling_xglm.py @@ -44,7 +44,7 @@ class XGLMModelTester: use_labels=True, vocab_size=99, d_model=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, ffn_dim=37, activation_function="gelu", diff --git a/tests/models/xlm/test_modeling_xlm.py b/tests/models/xlm/test_modeling_xlm.py index d8a1844114..b551e7e645 100644 --- a/tests/models/xlm/test_modeling_xlm.py +++ b/tests/models/xlm/test_modeling_xlm.py @@ -57,7 +57,7 @@ class XLMModelTester: vocab_size=99, n_special=0, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, diff --git a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py index 7ec84c9b1f..828d6a02a6 100644 --- a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py +++ b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py @@ -55,7 +55,7 @@ class XLMRobertaXLModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/xlnet/test_modeling_xlnet.py b/tests/models/xlnet/test_modeling_xlnet.py index 2b3f4752ee..2b0c95cd6d 100644 --- a/tests/models/xlnet/test_modeling_xlnet.py +++ b/tests/models/xlnet/test_modeling_xlnet.py @@ -56,7 +56,7 @@ class XLNetModelTester: hidden_size=32, num_attention_heads=4, d_inner=128, - num_hidden_layers=5, + num_hidden_layers=2, type_sequence_label_size=2, untie_r=True, bi_data=False, diff --git a/tests/models/xmod/test_modeling_xmod.py b/tests/models/xmod/test_modeling_xmod.py index 5845dee744..fc1ce44e35 100644 --- a/tests/models/xmod/test_modeling_xmod.py +++ b/tests/models/xmod/test_modeling_xmod.py @@ -51,7 +51,7 @@ class XmodModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/yolos/test_modeling_yolos.py b/tests/models/yolos/test_modeling_yolos.py index 7c49bb864e..c1fb50e30b 100644 --- a/tests/models/yolos/test_modeling_yolos.py +++ b/tests/models/yolos/test_modeling_yolos.py @@ -52,7 +52,7 @@ class YolosModelTester: is_training=True, use_labels=True, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/models/yoso/test_modeling_yoso.py b/tests/models/yoso/test_modeling_yoso.py index e275e19e40..67d7b9edc4 100644 --- a/tests/models/yoso/test_modeling_yoso.py +++ b/tests/models/yoso/test_modeling_yoso.py @@ -51,7 +51,7 @@ class YosoModelTester: use_labels=True, vocab_size=99, hidden_size=32, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, hidden_act="gelu", diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index d80624b718..0cdc94fc8e 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1017,7 +1017,8 @@ class ModelTesterMixin: attentions = outputs[-1] self.assertEqual(attentions[0].shape[-3], 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + # TODO: To have this check, we will need at least 3 layers. Do we really need it? + # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) def test_head_pruning_save_load_from_pretrained(self): @@ -1053,7 +1054,8 @@ class ModelTesterMixin: outputs = model(**self._prepare_for_class(inputs_dict, model_class)) attentions = outputs[-1] self.assertEqual(attentions[0].shape[-3], 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + # TODO: To have this check, we will need at least 3 layers. Do we really need it? + # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) def test_head_pruning_save_load_from_config_init(self): @@ -1087,7 +1089,8 @@ class ModelTesterMixin: attentions = outputs[-1] self.assertEqual(attentions[0].shape[-3], 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + # TODO: To have this check, we will need at least 3 layers. Do we really need it? + # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) def test_head_pruning_integration(self): @@ -1106,7 +1109,7 @@ class ModelTesterMixin: inputs_dict["output_attentions"] = True config.output_hidden_states = False - heads_to_prune = {0: [0], 1: [1, 2]} + heads_to_prune = {1: [1, 2]} config.pruned_heads = heads_to_prune model = model_class(config=config) @@ -1117,10 +1120,8 @@ class ModelTesterMixin: outputs = model(**self._prepare_for_class(inputs_dict, model_class)) attentions = outputs[-1] - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 0) self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) with tempfile.TemporaryDirectory() as temp_dir_name: model.save_pretrained(temp_dir_name) @@ -1131,12 +1132,10 @@ class ModelTesterMixin: outputs = model(**self._prepare_for_class(inputs_dict, model_class)) attentions = outputs[-1] - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 0) self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) - heads_to_prune = {0: [0], 2: [1, 2]} + heads_to_prune = {0: [0], 1: [1, 2]} model.prune_heads(heads_to_prune) with torch.no_grad(): @@ -1145,10 +1144,8 @@ class ModelTesterMixin: self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2) - self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) - self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]}) + self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2]}) def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class):