From cc24b0378e6062895a03a077860be28d290c1d07 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 16 Jul 2025 13:50:35 +0100 Subject: [PATCH] Better typing for model.config (#39132) * Apply to all models config annotation * Update modular to preserve order * Apply modular * fix define docstring * fix dinov2 consistency (docs<->modular) * fix InstructBlipVideoForConditionalGeneration docs<->modular consistency * fixup * remove duplicate code * Delete config_class attribute from the modeling code * Add config_class attribute in base model * Update init sub class * Deprecated models update * Update new models * Fix remote code BC issue * fixup * fixing more corner cases * fix new models * add test * modular docs update * fix comment a bit * fix for py3.9 --- src/transformers/generation/watermarking.py | 2 +- src/transformers/modeling_utils.py | 26 ++++++++++++- .../models/aimv2/modeling_aimv2.py | 6 +-- .../models/aimv2/modular_aimv2.py | 4 +- .../models/albert/modeling_albert.py | 4 +- .../models/align/modeling_align.py | 8 ++-- .../models/altclip/modeling_altclip.py | 10 ++--- .../models/arcee/modeling_arcee.py | 2 +- src/transformers/models/aria/modeling_aria.py | 4 +- src/transformers/models/aria/modular_aria.py | 4 +- .../modeling_audio_spectrogram_transformer.py | 2 +- .../models/autoformer/modeling_autoformer.py | 2 +- .../models/aya_vision/modeling_aya_vision.py | 2 +- .../models/bamba/modeling_bamba.py | 2 +- .../models/bamba/modular_bamba.py | 2 +- src/transformers/models/bark/modeling_bark.py | 12 +++--- src/transformers/models/bart/modeling_bart.py | 2 +- src/transformers/models/beit/modeling_beit.py | 2 +- src/transformers/models/bert/modeling_bert.py | 2 +- .../modeling_bert_generation.py | 2 +- .../models/big_bird/modeling_big_bird.py | 2 +- .../modeling_bigbird_pegasus.py | 2 +- .../models/biogpt/modeling_biogpt.py | 2 +- .../models/biogpt/modular_biogpt.py | 2 +- src/transformers/models/bit/modeling_bit.py | 2 +- 
.../models/bitnet/modeling_bitnet.py | 2 +- .../models/blenderbot/modeling_blenderbot.py | 2 +- .../modeling_blenderbot_small.py | 2 +- src/transformers/models/blip/modeling_blip.py | 12 +++--- .../models/blip/modeling_blip_text.py | 2 +- .../models/blip_2/modeling_blip_2.py | 8 ++-- .../models/bloom/modeling_bloom.py | 2 +- .../bridgetower/modeling_bridgetower.py | 6 +-- src/transformers/models/bros/modeling_bros.py | 2 +- .../models/camembert/modeling_camembert.py | 2 +- .../models/canine/modeling_canine.py | 2 +- .../models/chameleon/modeling_chameleon.py | 4 +- .../chinese_clip/modeling_chinese_clip.py | 8 ++-- src/transformers/models/clap/modeling_clap.py | 12 +++--- src/transformers/models/clip/modeling_clip.py | 12 +++--- .../models/clipseg/modeling_clipseg.py | 10 ++--- src/transformers/models/clvp/modeling_clvp.py | 4 +- .../models/codegen/modeling_codegen.py | 2 +- .../models/cohere/modeling_cohere.py | 2 +- .../models/cohere2/modeling_cohere2.py | 2 +- .../models/cohere2/modular_cohere2.py | 2 +- .../models/colpali/modeling_colpali.py | 2 +- .../models/colqwen2/modeling_colqwen2.py | 2 +- .../modeling_conditional_detr.py | 2 +- .../models/convbert/modeling_convbert.py | 2 +- .../models/convnext/modeling_convnext.py | 2 +- .../models/convnextv2/modeling_convnextv2.py | 2 +- .../models/cpmant/modeling_cpmant.py | 2 +- src/transformers/models/csm/modeling_csm.py | 4 +- src/transformers/models/csm/modular_csm.py | 4 +- src/transformers/models/ctrl/modeling_ctrl.py | 2 +- src/transformers/models/cvt/modeling_cvt.py | 2 +- .../models/d_fine/modeling_d_fine.py | 2 +- .../models/dab_detr/modeling_dab_detr.py | 2 +- src/transformers/models/dac/modeling_dac.py | 2 +- .../data2vec/modeling_data2vec_audio.py | 2 +- .../models/data2vec/modeling_data2vec_text.py | 2 +- .../data2vec/modeling_data2vec_vision.py | 2 +- .../models/data2vec/modular_data2vec_audio.py | 2 +- src/transformers/models/dbrx/modeling_dbrx.py | 2 +- .../models/deberta/modeling_deberta.py | 2 +- 
.../models/deberta_v2/modeling_deberta_v2.py | 2 +- .../modeling_decision_transformer.py | 4 +- .../deepseek_v2/modeling_deepseek_v2.py | 2 +- .../deepseek_v3/modeling_deepseek_v3.py | 2 +- .../modeling_deformable_detr.py | 2 +- src/transformers/models/deit/modeling_deit.py | 2 +- .../models/deprecated/deta/modeling_deta.py | 2 +- .../modeling_efficientformer.py | 2 +- .../deprecated/ernie_m/modeling_ernie_m.py | 2 +- .../modeling_gptsan_japanese.py | 2 +- .../graphormer/modeling_graphormer.py | 2 +- .../deprecated/jukebox/modeling_jukebox.py | 6 +-- .../models/deprecated/mctct/modeling_mctct.py | 2 +- .../models/deprecated/mega/modeling_mega.py | 2 +- .../models/deprecated/nat/modeling_nat.py | 2 +- .../models/deprecated/nezha/modeling_nezha.py | 2 +- .../open_llama/modeling_open_llama.py | 2 +- .../deprecated/qdqbert/modeling_qdqbert.py | 2 +- .../models/deprecated/realm/modeling_realm.py | 2 +- .../retribert/modeling_retribert.py | 2 +- .../modeling_speech_to_text_2.py | 2 +- .../modeling_trajectory_transformer.py | 2 +- .../transfo_xl/modeling_transfo_xl.py | 2 +- .../models/deprecated/tvlt/modeling_tvlt.py | 2 +- .../models/deprecated/van/modeling_van.py | 2 +- .../vit_hybrid/modeling_vit_hybrid.py | 2 +- .../xlm_prophetnet/modeling_xlm_prophetnet.py | 2 +- .../depth_anything/modeling_depth_anything.py | 2 +- .../models/depth_pro/modeling_depth_pro.py | 2 +- src/transformers/models/detr/modeling_detr.py | 2 +- src/transformers/models/dia/modeling_dia.py | 2 +- src/transformers/models/dia/modular_dia.py | 2 +- .../models/diffllama/modeling_diffllama.py | 2 +- .../models/dinat/modeling_dinat.py | 2 +- .../models/dinov2/modeling_dinov2.py | 2 +- .../modeling_dinov2_with_registers.py | 2 +- .../models/distilbert/modeling_distilbert.py | 2 +- src/transformers/models/doge/modeling_doge.py | 2 +- .../models/donut/modeling_donut_swin.py | 2 +- .../models/dots1/modeling_dots1.py | 2 +- src/transformers/models/dpr/modeling_dpr.py | 6 +-- 
src/transformers/models/dpt/modeling_dpt.py | 2 +- .../efficientnet/modeling_efficientnet.py | 2 +- .../models/electra/modeling_electra.py | 4 +- src/transformers/models/emu3/modeling_emu3.py | 6 +-- src/transformers/models/emu3/modular_emu3.py | 4 +- .../models/encodec/modeling_encodec.py | 2 +- .../modeling_encoder_decoder.py | 2 +- src/transformers/models/eomt/modeling_eomt.py | 2 +- src/transformers/models/eomt/modular_eomt.py | 2 +- .../models/ernie/modeling_ernie.py | 2 +- src/transformers/models/esm/modeling_esm.py | 2 +- .../models/falcon/modeling_falcon.py | 2 +- .../models/falcon_h1/modeling_falcon_h1.py | 2 +- .../models/falcon_h1/modular_falcon_h1.py | 2 +- .../falcon_mamba/modeling_falcon_mamba.py | 2 +- .../modeling_fastspeech2_conformer.py | 6 +-- .../models/flaubert/modeling_flaubert.py | 2 +- .../models/flava/modeling_flava.py | 12 +++--- src/transformers/models/fnet/modeling_fnet.py | 2 +- .../models/focalnet/modeling_focalnet.py | 2 +- src/transformers/models/fsmt/modeling_fsmt.py | 2 +- .../models/funnel/modeling_funnel.py | 2 +- src/transformers/models/fuyu/modeling_fuyu.py | 2 +- .../models/gemma/modeling_gemma.py | 2 +- .../models/gemma2/modeling_gemma2.py | 2 +- .../models/gemma3/modeling_gemma3.py | 6 +-- .../models/gemma3/modular_gemma3.py | 4 +- .../models/gemma3n/modeling_gemma3n.py | 8 ++-- .../models/gemma3n/modular_gemma3n.py | 6 +-- src/transformers/models/git/modeling_git.py | 4 +- src/transformers/models/glm/modeling_glm.py | 2 +- src/transformers/models/glm4/modeling_glm4.py | 2 +- .../models/glm4v/modeling_glm4v.py | 8 ++-- .../models/glm4v/modular_glm4v.py | 2 +- .../models/glm4v/processing_glm4v.py | 4 +- src/transformers/models/glpn/modeling_glpn.py | 2 +- .../models/got_ocr2/modeling_got_ocr2.py | 2 +- src/transformers/models/gpt2/modeling_gpt2.py | 2 +- .../gpt_bigcode/modeling_gpt_bigcode.py | 2 +- .../models/gpt_neo/modeling_gpt_neo.py | 2 +- .../models/gpt_neox/modeling_gpt_neox.py | 2 +- .../modeling_gpt_neox_japanese.py 
| 2 +- src/transformers/models/gptj/modeling_gptj.py | 2 +- .../models/granite/modeling_granite.py | 2 +- .../granite_speech/modeling_granite_speech.py | 2 +- .../models/granitemoe/modeling_granitemoe.py | 2 +- .../modeling_granitemoehybrid.py | 2 +- .../modular_granitemoehybrid.py | 2 +- .../modeling_granitemoeshared.py | 2 +- .../modular_granitemoeshared.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 2 +- .../models/groupvit/modeling_groupvit.py | 8 ++-- .../models/helium/modeling_helium.py | 2 +- .../models/hgnet_v2/modeling_hgnet_v2.py | 2 +- .../models/hgnet_v2/modular_hgnet_v2.py | 2 +- .../models/hiera/modeling_hiera.py | 2 +- .../models/hubert/modeling_hubert.py | 2 +- .../models/hubert/modular_hubert.py | 2 +- .../models/ibert/modeling_ibert.py | 2 +- .../models/idefics/modeling_idefics.py | 2 +- .../models/idefics2/modeling_idefics2.py | 6 +-- .../models/idefics3/modeling_idefics3.py | 4 +- .../models/ijepa/modeling_ijepa.py | 2 +- .../models/ijepa/modular_ijepa.py | 2 +- .../models/imagegpt/modeling_imagegpt.py | 2 +- .../models/informer/modeling_informer.py | 2 +- .../models/informer/modular_informer.py | 2 +- .../instructblip/modeling_instructblip.py | 6 +-- .../modeling_instructblipvideo.py | 6 +-- .../models/internvl/modeling_internvl.py | 4 +- .../models/internvl/modular_internvl.py | 2 +- .../models/jamba/modeling_jamba.py | 2 +- .../models/janus/modeling_janus.py | 6 +-- .../models/janus/modular_janus.py | 2 +- .../models/jetmoe/modeling_jetmoe.py | 2 +- .../models/kosmos2/modeling_kosmos2.py | 12 +++--- .../modeling_kyutai_speech_to_text.py | 2 +- .../models/layoutlm/modeling_layoutlm.py | 2 +- .../models/layoutlmv2/modeling_layoutlmv2.py | 2 +- .../models/layoutlmv3/modeling_layoutlmv3.py | 2 +- src/transformers/models/led/modeling_led.py | 2 +- .../models/levit/modeling_levit.py | 2 +- src/transformers/models/lfm2/modeling_lfm2.py | 2 +- .../models/lightglue/modeling_lightglue.py | 2 +- .../models/lightglue/modular_lightglue.py | 2 
+- src/transformers/models/lilt/modeling_lilt.py | 2 +- .../models/llama/modeling_llama.py | 2 +- .../models/llama4/modeling_llama4.py | 10 ++--- .../models/llava/modeling_llava.py | 2 +- .../models/llava_next/modeling_llava_next.py | 2 +- .../modeling_llava_next_video.py | 2 +- .../modeling_llava_onevision.py | 2 +- .../models/longformer/modeling_longformer.py | 2 +- .../models/longt5/modeling_longt5.py | 2 +- src/transformers/models/luke/modeling_luke.py | 2 +- .../models/lxmert/modeling_lxmert.py | 2 +- .../models/m2m_100/modeling_m2m_100.py | 2 +- .../models/mamba/modeling_mamba.py | 2 +- .../models/mamba2/modeling_mamba2.py | 2 +- .../models/marian/modeling_marian.py | 2 +- .../models/markuplm/modeling_markuplm.py | 2 +- .../mask2former/modeling_mask2former.py | 2 +- .../models/maskformer/modeling_maskformer.py | 2 +- .../maskformer/modeling_maskformer_swin.py | 2 +- .../models/mbart/modeling_mbart.py | 2 +- .../megatron_bert/modeling_megatron_bert.py | 2 +- .../models/mgp_str/modeling_mgp_str.py | 4 +- src/transformers/models/mimi/modeling_mimi.py | 2 +- .../models/minimax/modeling_minimax.py | 2 +- .../models/mistral/modeling_mistral.py | 2 +- .../models/mistral3/modeling_mistral3.py | 2 +- .../models/mixtral/modeling_mixtral.py | 2 +- src/transformers/models/mlcd/modeling_mlcd.py | 4 +- src/transformers/models/mlcd/modular_mlcd.py | 2 +- .../models/mllama/modeling_mllama.py | 8 ++-- .../models/mobilebert/modeling_mobilebert.py | 2 +- .../mobilenet_v1/modeling_mobilenet_v1.py | 2 +- .../mobilenet_v2/modeling_mobilenet_v2.py | 2 +- .../models/mobilevit/modeling_mobilevit.py | 2 +- .../mobilevitv2/modeling_mobilevitv2.py | 2 +- .../models/modernbert/modeling_modernbert.py | 2 +- .../models/modernbert/modular_modernbert.py | 2 +- .../modeling_modernbert_decoder.py | 2 +- .../modular_modernbert_decoder.py | 2 +- .../models/moonshine/modeling_moonshine.py | 2 +- .../models/moonshine/modular_moonshine.py | 2 +- .../models/moshi/modeling_moshi.py | 6 +-- 
.../models/mpnet/modeling_mpnet.py | 2 +- src/transformers/models/mpt/modeling_mpt.py | 2 +- src/transformers/models/mra/modeling_mra.py | 2 +- src/transformers/models/mt5/modeling_mt5.py | 8 ++-- .../models/musicgen/modeling_musicgen.py | 4 +- .../modeling_musicgen_melody.py | 4 +- src/transformers/models/mvp/modeling_mvp.py | 2 +- .../models/nemotron/modeling_nemotron.py | 2 +- .../models/nllb_moe/modeling_nllb_moe.py | 2 +- .../nystromformer/modeling_nystromformer.py | 2 +- src/transformers/models/olmo/modeling_olmo.py | 2 +- .../models/olmo2/modeling_olmo2.py | 2 +- .../models/olmoe/modeling_olmoe.py | 2 +- .../omdet_turbo/modeling_omdet_turbo.py | 2 +- .../models/oneformer/modeling_oneformer.py | 2 +- .../models/openai/modeling_openai.py | 2 +- src/transformers/models/opt/modeling_opt.py | 2 +- .../models/owlv2/modeling_owlv2.py | 10 ++--- .../models/owlvit/modeling_owlvit.py | 10 ++--- .../models/paligemma/modeling_paligemma.py | 2 +- .../patchtsmixer/modeling_patchtsmixer.py | 2 +- .../models/patchtst/modeling_patchtst.py | 2 +- .../models/pegasus/modeling_pegasus.py | 2 +- .../models/pegasus_x/modeling_pegasus_x.py | 2 +- .../models/perceiver/modeling_perceiver.py | 2 +- .../perception_lm/modeling_perception_lm.py | 2 +- .../models/persimmon/modeling_persimmon.py | 2 +- src/transformers/models/phi/modeling_phi.py | 2 +- src/transformers/models/phi3/modeling_phi3.py | 2 +- .../modeling_phi4_multimodal.py | 8 ++-- .../modular_phi4_multimodal.py | 6 +-- .../models/phimoe/modeling_phimoe.py | 2 +- .../models/pix2struct/modeling_pix2struct.py | 8 ++-- .../models/pixtral/modeling_pixtral.py | 2 +- .../models/plbart/modeling_plbart.py | 2 +- .../models/plbart/modular_plbart.py | 2 +- .../models/poolformer/modeling_poolformer.py | 2 +- .../models/pop2piano/modeling_pop2piano.py | 2 +- .../modeling_prompt_depth_anything.py | 2 +- .../modular_prompt_depth_anything.py | 2 +- .../models/prophetnet/modeling_prophetnet.py | 2 +- 
src/transformers/models/pvt/modeling_pvt.py | 2 +- .../models/pvt_v2/modeling_pvt_v2.py | 2 +- .../models/qwen2/modeling_qwen2.py | 2 +- .../qwen2_5_omni/modeling_qwen2_5_omni.py | 22 +++++------ .../qwen2_5_omni/modular_qwen2_5_omni.py | 22 +++++------ .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 8 ++-- .../models/qwen2_5_vl/modular_qwen2_5_vl.py | 4 +- .../qwen2_audio/modeling_qwen2_audio.py | 4 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 2 +- .../models/qwen2_vl/modeling_qwen2_vl.py | 6 +-- .../models/qwen3/modeling_qwen3.py | 2 +- .../models/qwen3_moe/modeling_qwen3_moe.py | 2 +- src/transformers/models/rag/modeling_rag.py | 2 +- .../modeling_recurrent_gemma.py | 2 +- .../models/reformer/modeling_reformer.py | 2 +- .../models/regnet/modeling_regnet.py | 2 +- .../models/rembert/modeling_rembert.py | 2 +- .../models/resnet/modeling_resnet.py | 2 +- .../models/roberta/modeling_roberta.py | 2 +- .../modeling_roberta_prelayernorm.py | 2 +- .../models/roc_bert/modeling_roc_bert.py | 2 +- .../models/roformer/modeling_roformer.py | 2 +- .../models/rt_detr/modeling_rt_detr.py | 2 +- .../models/rt_detr/modeling_rt_detr_resnet.py | 2 +- .../models/rt_detr_v2/modeling_rt_detr_v2.py | 2 +- src/transformers/models/rwkv/modeling_rwkv.py | 2 +- src/transformers/models/sam/modeling_sam.py | 4 +- .../models/sam_hq/modeling_sam_hq.py | 4 +- .../seamless_m4t/modeling_seamless_m4t.py | 4 +- .../modeling_seamless_m4t_v2.py | 4 +- .../models/segformer/modeling_segformer.py | 2 +- .../models/seggpt/modeling_seggpt.py | 2 +- src/transformers/models/sew/modeling_sew.py | 2 +- src/transformers/models/sew/modular_sew.py | 2 +- .../models/sew_d/modeling_sew_d.py | 2 +- .../shieldgemma2/modeling_shieldgemma2.py | 2 +- .../models/siglip/modeling_siglip.py | 8 ++-- .../models/siglip2/modeling_siglip2.py | 8 ++-- .../models/smollm3/modeling_smollm3.py | 2 +- .../models/smolvlm/modeling_smolvlm.py | 4 +- .../modeling_speech_encoder_decoder.py | 2 +- 
.../speech_to_text/modeling_speech_to_text.py | 2 +- .../models/speecht5/modeling_speecht5.py | 4 +- .../models/splinter/modeling_splinter.py | 2 +- .../squeezebert/modeling_squeezebert.py | 2 +- .../models/stablelm/modeling_stablelm.py | 2 +- .../models/starcoder2/modeling_starcoder2.py | 2 +- .../models/superglue/modeling_superglue.py | 2 +- .../models/superpoint/modeling_superpoint.py | 2 +- .../swiftformer/modeling_swiftformer.py | 2 +- src/transformers/models/swin/modeling_swin.py | 2 +- .../models/swin2sr/modeling_swin2sr.py | 2 +- .../models/swinv2/modeling_swinv2.py | 2 +- .../modeling_switch_transformers.py | 2 +- src/transformers/models/t5/modeling_t5.py | 2 +- .../models/t5gemma/modeling_t5gemma.py | 2 +- .../models/t5gemma/modular_t5gemma.py | 2 +- .../modeling_table_transformer.py | 2 +- .../models/tapas/modeling_tapas.py | 4 +- .../models/textnet/modeling_textnet.py | 2 +- .../modeling_time_series_transformer.py | 2 +- .../models/timesfm/modeling_timesfm.py | 2 +- .../models/timesfm/modular_timesfm.py | 2 +- .../timesformer/modeling_timesformer.py | 2 +- .../timm_backbone/modeling_timm_backbone.py | 2 +- .../timm_wrapper/modeling_timm_wrapper.py | 2 +- .../models/trocr/modeling_trocr.py | 2 +- src/transformers/models/tvp/modeling_tvp.py | 2 +- src/transformers/models/udop/modeling_udop.py | 2 +- src/transformers/models/umt5/modeling_umt5.py | 4 +- .../models/unispeech/modeling_unispeech.py | 2 +- .../models/unispeech/modular_unispeech.py | 2 +- .../unispeech_sat/modeling_unispeech_sat.py | 2 +- .../unispeech_sat/modular_unispeech_sat.py | 2 +- .../models/univnet/modeling_univnet.py | 2 +- .../models/upernet/modeling_upernet.py | 2 +- .../video_llava/modeling_video_llava.py | 2 +- .../models/videomae/modeling_videomae.py | 2 +- src/transformers/models/vilt/modeling_vilt.py | 2 +- .../models/vipllava/modeling_vipllava.py | 2 +- .../modeling_vision_encoder_decoder.py | 2 +- .../modeling_vision_text_dual_encoder.py | 2 +- 
.../visual_bert/modeling_visual_bert.py | 2 +- src/transformers/models/vit/modeling_vit.py | 2 +- .../models/vit_mae/modeling_vit_mae.py | 2 +- .../models/vit_msn/modeling_vit_msn.py | 2 +- .../models/vitdet/modeling_vitdet.py | 2 +- .../models/vitmatte/modeling_vitmatte.py | 2 +- .../models/vitpose/modeling_vitpose.py | 2 +- .../modeling_vitpose_backbone.py | 2 +- src/transformers/models/vits/modeling_vits.py | 2 +- .../models/vivit/modeling_vivit.py | 2 +- .../models/vjepa2/modeling_vjepa2.py | 2 +- .../models/wav2vec2/modeling_wav2vec2.py | 2 +- .../wav2vec2_bert/modeling_wav2vec2_bert.py | 2 +- .../wav2vec2_bert/modular_wav2vec2_bert.py | 2 +- .../modeling_wav2vec2_conformer.py | 2 +- .../modular_wav2vec2_conformer.py | 2 +- .../models/wavlm/modeling_wavlm.py | 2 +- .../models/wavlm/modular_wavlm.py | 2 +- .../models/whisper/modeling_whisper.py | 2 +- .../models/x_clip/modeling_x_clip.py | 8 ++-- src/transformers/models/xglm/modeling_xglm.py | 2 +- src/transformers/models/xlm/modeling_xlm.py | 2 +- .../xlm_roberta/modeling_xlm_roberta.py | 2 +- .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 2 +- .../models/xlnet/modeling_xlnet.py | 2 +- src/transformers/models/xmod/modeling_xmod.py | 2 +- .../models/yolos/modeling_yolos.py | 2 +- src/transformers/models/yoso/modeling_yoso.py | 2 +- .../models/zamba/modeling_zamba.py | 2 +- .../models/zamba2/modeling_zamba2.py | 2 +- .../models/zamba2/modular_zamba2.py | 2 +- .../models/zoedepth/modeling_zoedepth.py | 2 +- tests/utils/test_modeling_utils.py | 31 +++++++++++++++ utils/modular_model_converter.py | 38 +++++++------------ 391 files changed, 630 insertions(+), 585 deletions(-) diff --git a/src/transformers/generation/watermarking.py b/src/transformers/generation/watermarking.py index 16993a949a..9ce2539851 100644 --- a/src/transformers/generation/watermarking.py +++ b/src/transformers/generation/watermarking.py @@ -375,7 +375,7 @@ class BayesianDetectorModel(PreTrainedModel): configuration. 
Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ - config_class = BayesianDetectorConfig + config: BayesianDetectorConfig base_model_prefix = "model" def __init__(self, config): diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 11fa589ff8..6fadc8adf1 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -33,7 +33,7 @@ from contextlib import contextmanager from enum import Enum from functools import partial, wraps from threading import Thread -from typing import Any, Callable, Optional, TypeVar, Union +from typing import Any, Callable, Optional, TypeVar, Union, get_type_hints from zipfile import is_zipfile import torch @@ -2060,6 +2060,30 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi """ return "pt" + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + # For BC we keep the original `config_class` definition in case + # there is a `config_class` attribute (e.g. remote code models), + # otherwise we derive it from the annotated `config` attribute. 
+ + # defined in this particular subclass + child_annotation = cls.__dict__.get("__annotations__", {}).get("config", None) + child_attribute = cls.__dict__.get("config_class", None) + + # defined in the class (this subclass or any parent class) + full_annotation = get_type_hints(cls).get("config", None) + full_attribute = cls.config_class + + # priority (child class_config -> child annotation -> global class_config -> global annotation) + if child_attribute is not None: + cls.config_class = child_attribute + elif child_annotation is not None: + cls.config_class = child_annotation + elif full_attribute is not None: + cls.config_class = full_attribute + elif full_annotation is not None: + cls.config_class = full_annotation + def __init__(self, config: PretrainedConfig, *inputs, **kwargs): super().__init__() if not isinstance(config, PretrainedConfig): diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index c0d9f0990c..7b124a64c9 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -434,7 +434,7 @@ class Aimv2PreTrainedModel(PreTrainedModel): models. The model is only intended for inference and doesn't support finetuning. 
""" - config_class = Aimv2Config + config: Aimv2Config base_model_prefix = "aimv2" supports_gradient_checkpointing = True _no_split_modules = [ @@ -474,8 +474,8 @@ class Aimv2PreTrainedModel(PreTrainedModel): """ ) class Aimv2VisionModel(Aimv2PreTrainedModel): + config: Aimv2VisionConfig main_input_name = "pixel_values" - config_class = Aimv2VisionConfig def __init__(self, config: Aimv2VisionConfig): super().__init__(config) @@ -640,7 +640,7 @@ def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor: @auto_docstring class Aimv2Model(Aimv2PreTrainedModel): - config_class = Aimv2Config + config: Aimv2Config _no_split_modules = ["Aimv2TextEmbeddings", "Aimv2EncoderLayer", "Aimv2VisionEmbeddings"] def __init__(self, config: Aimv2Config): diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 703c42e308..7c83bf4e2d 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -431,7 +431,7 @@ class Aimv2PreTrainedModel(PreTrainedModel): models. The model is only intended for inference and doesn't support finetuning. 
""" - config_class = Aimv2Config + config: Aimv2Config base_model_prefix = "aimv2" supports_gradient_checkpointing = True _no_split_modules = [ @@ -471,8 +471,8 @@ class Aimv2PreTrainedModel(PreTrainedModel): """ ) class Aimv2VisionModel(Aimv2PreTrainedModel): + config: Aimv2VisionConfig main_input_name = "pixel_values" - config_class = Aimv2VisionConfig def __init__(self, config: Aimv2VisionConfig): super().__init__(config) diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 005665d324..a7de99ac1d 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -555,7 +555,7 @@ class AlbertTransformer(nn.Module): @auto_docstring class AlbertPreTrainedModel(PreTrainedModel): - config_class = AlbertConfig + config: AlbertConfig load_tf_weights = load_tf_weights_in_albert base_model_prefix = "albert" _supports_sdpa = True @@ -606,7 +606,7 @@ class AlbertForPreTrainingOutput(ModelOutput): @auto_docstring class AlbertModel(AlbertPreTrainedModel): - config_class = AlbertConfig + config: AlbertConfig base_model_prefix = "albert" def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True): diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index da015bf7dd..4bc5f442cb 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -881,7 +881,7 @@ class AlignTextPooler(nn.Module): @auto_docstring class AlignPreTrainedModel(PreTrainedModel): - config_class = AlignConfig + config: AlignConfig base_model_prefix = "align" supports_gradient_checkpointing = True @@ -910,7 +910,7 @@ class AlignPreTrainedModel(PreTrainedModel): """ ) class AlignTextModel(AlignPreTrainedModel): - config_class = AlignTextConfig + config: AlignTextConfig _no_split_modules = ["AlignTextEmbeddings"] def __init__(self, config: AlignTextConfig, 
add_pooling_layer: bool = True): @@ -1038,7 +1038,7 @@ class AlignTextModel(AlignPreTrainedModel): """ ) class AlignVisionModel(AlignPreTrainedModel): - config_class = AlignVisionConfig + config: AlignVisionConfig main_input_name = "pixel_values" supports_gradient_checkpointing = False @@ -1119,7 +1119,7 @@ class AlignVisionModel(AlignPreTrainedModel): @auto_docstring class AlignModel(AlignPreTrainedModel): - config_class = AlignConfig + config: AlignConfig def __init__(self, config: AlignConfig): super().__init__(config) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index c770dd5adc..9f44323fe4 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -837,7 +837,7 @@ class AltCLIPVisionEmbeddings(nn.Module): @auto_docstring class AltCLIPPreTrainedModel(PreTrainedModel): - config_class = AltCLIPConfig + config: AltCLIPConfig base_model_prefix = "altclip" supports_gradient_checkpointing = True _no_split_module = [] @@ -941,7 +941,7 @@ class AltCLIPVisionTransformer(nn.Module): class AltCLIPVisionModel(AltCLIPPreTrainedModel): - config_class = AltCLIPVisionConfig + config: AltCLIPVisionConfig main_input_name = "pixel_values" def __init__(self, config: AltCLIPVisionConfig): @@ -1003,7 +1003,7 @@ class AltCLIPVisionModel(AltCLIPPreTrainedModel): """ ) class AltRobertaModel(AltCLIPPreTrainedModel): - config_class = AltCLIPTextConfig + config: AltCLIPTextConfig # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->AltRoberta def __init__(self, config, add_pooling_layer=True): @@ -1121,7 +1121,7 @@ class AltRobertaModel(AltCLIPPreTrainedModel): class AltCLIPTextModel(AltCLIPPreTrainedModel): - config_class = AltCLIPTextConfig + config: AltCLIPTextConfig def __init__(self, config): super().__init__(config) @@ -1208,7 +1208,7 @@ class AltCLIPTextModel(AltCLIPPreTrainedModel): class 
AltCLIPModel(AltCLIPPreTrainedModel): - config_class = AltCLIPConfig + config: AltCLIPConfig def __init__(self, config: AltCLIPConfig): super().__init__(config) diff --git a/src/transformers/models/arcee/modeling_arcee.py b/src/transformers/models/arcee/modeling_arcee.py index 448ef08632..99a763b1e0 100644 --- a/src/transformers/models/arcee/modeling_arcee.py +++ b/src/transformers/models/arcee/modeling_arcee.py @@ -308,7 +308,7 @@ class ArceeDecoderLayer(GradientCheckpointingLayer): @auto_docstring class ArceePreTrainedModel(PreTrainedModel): - config_class = ArceeConfig + config: ArceeConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["ArceeDecoderLayer"] diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index 99c6030125..24e741f879 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -624,7 +624,7 @@ class AriaTextDecoderLayer(GradientCheckpointingLayer): @auto_docstring class AriaTextPreTrainedModel(PreTrainedModel): - config_class = AriaTextConfig + config: AriaTextConfig base_model_prefix = "model" _no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"] supports_gradient_checkpointing = True @@ -656,7 +656,7 @@ class AriaTextPreTrainedModel(PreTrainedModel): @auto_docstring class AriaPreTrainedModel(PreTrainedModel): - config_class = AriaConfig + config: AriaConfig base_model_prefix = "" supports_gradient_checkpointing = True _no_split_modules = ["AriaDecoderLayer"] diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index aa9aea69f8..95e3bad057 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -1279,7 +1279,7 @@ class AriaTextDecoderLayer(LlamaDecoderLayer): @auto_docstring class AriaTextPreTrainedModel(PreTrainedModel): - config_class = AriaTextConfig + config: 
AriaTextConfig base_model_prefix = "model" _no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"] supports_gradient_checkpointing = True @@ -1310,7 +1310,7 @@ class AriaTextPreTrainedModel(PreTrainedModel): class AriaPreTrainedModel(LlamaPreTrainedModel): - config_class = AriaConfig + config: AriaConfig base_model_prefix = "" _supports_static_cache = False # MoE models don't work with torch.compile (dynamic slicing) _supports_attention_backend = True diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 32a2c8bad1..d6ac5f15db 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -381,7 +381,7 @@ class ASTEncoder(nn.Module): @auto_docstring class ASTPreTrainedModel(PreTrainedModel): - config_class = ASTConfig + config: ASTConfig base_model_prefix = "audio_spectrogram_transformer" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/autoformer/modeling_autoformer.py b/src/transformers/models/autoformer/modeling_autoformer.py index 974d2a5e4d..cdf7a7db05 100644 --- a/src/transformers/models/autoformer/modeling_autoformer.py +++ b/src/transformers/models/autoformer/modeling_autoformer.py @@ -847,7 +847,7 @@ class AutoformerDecoderLayer(GradientCheckpointingLayer): @auto_docstring class AutoformerPreTrainedModel(PreTrainedModel): - config_class = AutoformerConfig + config: AutoformerConfig base_model_prefix = "model" main_input_name = "past_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/aya_vision/modeling_aya_vision.py b/src/transformers/models/aya_vision/modeling_aya_vision.py index da420c8211..5692819b3e 100644 --- 
a/src/transformers/models/aya_vision/modeling_aya_vision.py +++ b/src/transformers/models/aya_vision/modeling_aya_vision.py @@ -89,7 +89,7 @@ class AyaVisionMultiModalProjector(nn.Module): @auto_docstring class AyaVisionPreTrainedModel(PreTrainedModel): - config_class = AyaVisionConfig + config: AyaVisionConfig base_model_prefix = "" supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py index 3e63239970..a0cfecc4e3 100644 --- a/src/transformers/models/bamba/modeling_bamba.py +++ b/src/transformers/models/bamba/modeling_bamba.py @@ -1034,7 +1034,7 @@ class BambaDecoderLayer(GradientCheckpointingLayer): @auto_docstring class BambaPreTrainedModel(PreTrainedModel): - config_class = BambaConfig + config: BambaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["BambaDecoderLayer"] diff --git a/src/transformers/models/bamba/modular_bamba.py b/src/transformers/models/bamba/modular_bamba.py index 937b41113b..e1a6eed50d 100644 --- a/src/transformers/models/bamba/modular_bamba.py +++ b/src/transformers/models/bamba/modular_bamba.py @@ -805,7 +805,7 @@ class BambaDecoderLayer(JambaAttentionDecoderLayer): @auto_docstring class BambaPreTrainedModel(PreTrainedModel): - config_class = BambaConfig + config: BambaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["BambaDecoderLayer"] diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index 6f01ccd0d2..5dd03770ec 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -328,7 +328,7 @@ class BarkBlock(GradientCheckpointingLayer): @auto_docstring class BarkPreTrainedModel(PreTrainedModel): - config_class = BarkConfig + config: BarkConfig supports_gradient_checkpointing = False 
_supports_flash_attn = True @@ -374,7 +374,7 @@ class BarkPreTrainedModel(PreTrainedModel): # GPT2-like autoregressive model class BarkCausalModel(BarkPreTrainedModel, GenerationMixin): - config_class = BarkSubModelConfig + config: BarkSubModelConfig def __init__(self, config): super().__init__(config) @@ -627,7 +627,7 @@ class BarkCausalModel(BarkPreTrainedModel, GenerationMixin): ) class BarkSemanticModel(BarkCausalModel): base_model_prefix = "semantic" - config_class = BarkSemanticConfig + config: BarkSemanticConfig def generate( self, @@ -738,7 +738,7 @@ class BarkSemanticModel(BarkCausalModel): ) class BarkCoarseModel(BarkCausalModel): base_model_prefix = "coarse_acoustics" - config_class = BarkCoarseConfig + config: BarkCoarseConfig def preprocess_histories( self, @@ -959,7 +959,7 @@ class BarkCoarseModel(BarkCausalModel): ) class BarkFineModel(BarkPreTrainedModel): base_model_prefix = "fine_acoustics" - config_class = BarkFineConfig + config: BarkFineConfig main_input_name = "codebook_idx" def __init__(self, config): @@ -1393,7 +1393,7 @@ class BarkFineModel(BarkPreTrainedModel): """ ) class BarkModel(BarkPreTrainedModel): - config_class = BarkConfig + config: BarkConfig def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 77665f5313..1e43e754bb 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -484,7 +484,7 @@ class BartClassificationHead(nn.Module): @auto_docstring class BartPreTrainedModel(PreTrainedModel): - config_class = BartConfig + config: BartConfig base_model_prefix = "model" supports_gradient_checkpointing = True _keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"] diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index 9c964467e5..9138e7b840 100755 --- 
a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -722,7 +722,7 @@ class BeitEncoder(nn.Module): @auto_docstring class BeitPreTrainedModel(PreTrainedModel): - config_class = BeitConfig + config: BeitConfig base_model_prefix = "beit" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 48832a1cf0..c92f02a5f8 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -805,7 +805,7 @@ class BertPreTrainingHeads(nn.Module): @auto_docstring class BertPreTrainedModel(PreTrainedModel): - config_class = BertConfig + config: BertConfig load_tf_weights = load_tf_weights_in_bert base_model_prefix = "bert" supports_gradient_checkpointing = True diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 9dd0f39311..0a4d56c3f3 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -572,7 +572,7 @@ class BertGenerationEmbeddings(nn.Module): @auto_docstring class BertGenerationPreTrainedModel(PreTrainedModel): - config_class = BertGenerationConfig + config: BertGenerationConfig base_model_prefix = "bert" supports_gradient_checkpointing = True diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 3058bdc94f..affbc3335e 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -1709,7 +1709,7 @@ class BigBirdPreTrainingHeads(nn.Module): @auto_docstring class BigBirdPreTrainedModel(PreTrainedModel): - config_class = BigBirdConfig + config: BigBirdConfig load_tf_weights = load_tf_weights_in_big_bird 
base_model_prefix = "bert" supports_gradient_checkpointing = True diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 2466400b82..2220efc887 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1559,7 +1559,7 @@ class BigBirdPegasusClassificationHead(nn.Module): @auto_docstring class BigBirdPegasusPreTrainedModel(PreTrainedModel): - config_class = BigBirdPegasusConfig + config: BigBirdPegasusConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["BigBirdPegasusEncoderLayer", "BigBirdPegasusDecoderLayer"] diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py index 543c6cba5c..c1575b0c04 100755 --- a/src/transformers/models/biogpt/modeling_biogpt.py +++ b/src/transformers/models/biogpt/modeling_biogpt.py @@ -340,7 +340,7 @@ class BioGptDecoderLayer(GradientCheckpointingLayer): @auto_docstring class BioGptPreTrainedModel(PreTrainedModel): - config_class = BioGptConfig + config: BioGptConfig base_model_prefix = "biogpt" supports_gradient_checkpointing = True _supports_flash_attn = True diff --git a/src/transformers/models/biogpt/modular_biogpt.py b/src/transformers/models/biogpt/modular_biogpt.py index 0994ff6469..44ccac314d 100644 --- a/src/transformers/models/biogpt/modular_biogpt.py +++ b/src/transformers/models/biogpt/modular_biogpt.py @@ -165,7 +165,7 @@ class BioGptDecoderLayer(BartDecoderLayer): @auto_docstring class BioGptPreTrainedModel(PreTrainedModel): - config_class = BioGptConfig + config: BioGptConfig base_model_prefix = "biogpt" supports_gradient_checkpointing = True _supports_flash_attn = True diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index e6e5f68a98..140a2e7b52 100644 --- 
a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -631,7 +631,7 @@ class BitEncoder(nn.Module): @auto_docstring class BitPreTrainedModel(PreTrainedModel): - config_class = BitConfig + config: BitConfig base_model_prefix = "bit" main_input_name = "pixel_values" _no_split_modules = ["BitEmbeddings"] diff --git a/src/transformers/models/bitnet/modeling_bitnet.py b/src/transformers/models/bitnet/modeling_bitnet.py index a9fb4a30f0..66cf5d02f4 100644 --- a/src/transformers/models/bitnet/modeling_bitnet.py +++ b/src/transformers/models/bitnet/modeling_bitnet.py @@ -303,7 +303,7 @@ class BitNetRotaryEmbedding(nn.Module): @auto_docstring class BitNetPreTrainedModel(PreTrainedModel): - config_class = BitNetConfig + config: BitNetConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["BitNetDecoderLayer"] diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index a94a31a04b..c4bb3b6e19 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -452,7 +452,7 @@ class BlenderbotDecoderLayer(GradientCheckpointingLayer): @auto_docstring class BlenderbotPreTrainedModel(PreTrainedModel): - config_class = BlenderbotConfig + config: BlenderbotConfig base_model_prefix = "model" supports_gradient_checkpointing = True _supports_flash_attn = True diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index c6abb96300..74e5d0767a 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -445,7 +445,7 @@ class BlenderbotSmallDecoderLayer(GradientCheckpointingLayer): @auto_docstring class BlenderbotSmallPreTrainedModel(PreTrainedModel): - 
config_class = BlenderbotSmallConfig + config: BlenderbotSmallConfig base_model_prefix = "model" supports_gradient_checkpointing = True _supports_flash_attn = True diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index c799c2bf96..3644c71cc0 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -434,7 +434,7 @@ class BlipEncoderLayer(GradientCheckpointingLayer): @auto_docstring class BlipPreTrainedModel(PreTrainedModel): - config_class = BlipConfig + config: BlipConfig base_model_prefix = "blip" supports_gradient_checkpointing = True _no_split_modules = ["BlipEncoderLayer", "BlipTextEmbeddings"] @@ -551,7 +551,7 @@ class BlipEncoder(nn.Module): class BlipVisionModel(BlipPreTrainedModel): main_input_name = "pixel_values" - config_class = BlipVisionConfig + config: BlipVisionConfig def __init__(self, config: BlipVisionConfig): super().__init__(config) @@ -617,7 +617,7 @@ class BlipVisionModel(BlipPreTrainedModel): """ ) class BlipModel(BlipPreTrainedModel): - config_class = BlipConfig + config: BlipConfig def __init__(self, config: BlipConfig): super().__init__(config) @@ -902,7 +902,7 @@ class BlipModel(BlipPreTrainedModel): """ ) class BlipForConditionalGeneration(BlipPreTrainedModel, GenerationMixin): - config_class = BlipConfig + config: BlipConfig _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"] main_input_name = "pixel_values" @@ -1080,7 +1080,7 @@ class BlipForConditionalGeneration(BlipPreTrainedModel, GenerationMixin): """ ) class BlipForQuestionAnswering(BlipPreTrainedModel, GenerationMixin): - config_class = BlipConfig + config: BlipConfig _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"] def __init__(self, config: BlipConfig): @@ -1310,7 +1310,7 @@ class BlipForQuestionAnswering(BlipPreTrainedModel, GenerationMixin): """ ) class BlipForImageTextRetrieval(BlipPreTrainedModel): - config_class = BlipConfig + 
config: BlipConfig def __init__(self, config: BlipConfig): super().__init__(config) diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py index 821bd783c6..0c6e8777fd 100644 --- a/src/transformers/models/blip/modeling_blip_text.py +++ b/src/transformers/models/blip/modeling_blip_text.py @@ -575,7 +575,7 @@ class BlipTextPreTrainedModel(PreTrainedModel): models. """ - config_class = BlipTextConfig + config: BlipTextConfig base_model_prefix = "bert" _no_split_modules = [] diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 8235767b7e..4c7a52e6fb 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -405,7 +405,7 @@ class Blip2EncoderLayer(GradientCheckpointingLayer): @auto_docstring class Blip2PreTrainedModel(PreTrainedModel): - config_class = Blip2Config + config: Blip2Config base_model_prefix = "blip" supports_gradient_checkpointing = True _supports_attention_backend = True @@ -536,7 +536,7 @@ class Blip2Encoder(nn.Module): # Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->Blip2, BLIP->BLIP_2 class Blip2VisionModel(Blip2PreTrainedModel): main_input_name = "pixel_values" - config_class = Blip2VisionConfig + config: Blip2VisionConfig def __init__(self, config: Blip2VisionConfig): super().__init__(config) @@ -1234,7 +1234,7 @@ class Blip2QFormerModel(Blip2PreTrainedModel): """ ) class Blip2Model(Blip2PreTrainedModel): - config_class = Blip2Config + config: Blip2Config main_input_name = "pixel_values" _keep_in_fp32_modules = ["query_tokens", "qformer"] _supports_flash_attn = False # because self.qformer does not support FA2 @@ -1828,7 +1828,7 @@ class Blip2VisionModelWithProjection(Blip2PreTrainedModel): """ ) class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin): - config_class = Blip2Config + config: Blip2Config 
main_input_name = "pixel_values" _supports_static_cache = True diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index 242061eb4e..7da3e9de9e 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -428,7 +428,7 @@ class BloomBlock(GradientCheckpointingLayer): @auto_docstring class BloomPreTrainedModel(PreTrainedModel): - config_class = BloomConfig + config: BloomConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True _no_split_modules = ["BloomBlock"] diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 47f68fe23c..42d85da5e5 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -943,7 +943,7 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l @auto_docstring class BridgeTowerPreTrainedModel(PreTrainedModel): - config_class = BridgeTowerConfig + config: BridgeTowerConfig base_model_prefix = "bridgetower" supports_gradient_checkpointing = False _no_split_modules = ["BridgeTowerSelfAttention", "BridgeTowerResidualAttention"] @@ -977,7 +977,7 @@ class BridgeTowerPreTrainedModel(PreTrainedModel): class BridgeTowerVisionModel(BridgeTowerPreTrainedModel): - config_class = BridgeTowerVisionConfig + config: BridgeTowerVisionConfig def __init__(self, config): super().__init__(config) @@ -1006,7 +1006,7 @@ class BridgeTowerVisionModel(BridgeTowerPreTrainedModel): """ ) class BridgeTowerTextModel(BridgeTowerPreTrainedModel): - config_class = BridgeTowerTextConfig + config: BridgeTowerTextConfig def __init__(self, config, add_pooling_layer=True): r""" diff --git a/src/transformers/models/bros/modeling_bros.py b/src/transformers/models/bros/modeling_bros.py index dd8ecf2c9e..97a2f7fcf2 100755 --- 
a/src/transformers/models/bros/modeling_bros.py +++ b/src/transformers/models/bros/modeling_bros.py @@ -583,7 +583,7 @@ class BrosRelationExtractor(nn.Module): @auto_docstring class BrosPreTrainedModel(PreTrainedModel): - config_class = BrosConfig + config: BrosConfig base_model_prefix = "bros" def _init_weights(self, module): diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index dcb0d243d0..c0271d2f27 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -691,7 +691,7 @@ class CamembertPooler(nn.Module): @auto_docstring class CamembertPreTrainedModel(PreTrainedModel): - config_class = CamembertConfig + config: CamembertConfig base_model_prefix = "roberta" supports_gradient_checkpointing = True _supports_sdpa = True diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index 9866aad87a..1f83c15b4f 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -873,7 +873,7 @@ class CanineOnlyMLMHead(nn.Module): @auto_docstring class CaninePreTrainedModel(PreTrainedModel): - config_class = CanineConfig + config: CanineConfig load_tf_weights = load_tf_weights_in_canine base_model_prefix = "canine" supports_gradient_checkpointing = True diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index fe4899c7e9..9516f8b496 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -807,7 +807,7 @@ class ChameleonImageVocabularyMapping: @auto_docstring class ChameleonPreTrainedModel(PreTrainedModel): - config_class = ChameleonConfig + config: ChameleonConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = 
["ChameleonDecoderLayer", "ChameleonSwinDecoderLayer"] @@ -847,7 +847,7 @@ class ChameleonPreTrainedModel(PreTrainedModel): """ ) class ChameleonVQVAE(ChameleonPreTrainedModel): - config_class = ChameleonVQVAEConfig + config: ChameleonVQVAEConfig _no_split_modules = ["ChameleonVQVAEVectorQuantizer"] def __init__(self, config: ChameleonVQVAEConfig): diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index afe7bdb06a..6fcc04a940 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -607,7 +607,7 @@ class ChineseCLIPTextPooler(nn.Module): @auto_docstring class ChineseCLIPPreTrainedModel(PreTrainedModel): - config_class = ChineseCLIPConfig + config: ChineseCLIPConfig base_model_prefix = "chinese_clip" supports_gradient_checkpointing = True @@ -856,7 +856,7 @@ class ChineseCLIPTextModel(ChineseCLIPPreTrainedModel): `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
""" - config_class = ChineseCLIPTextConfig + config: ChineseCLIPTextConfig _no_split_modules = ["ChineseCLIPTextEmbeddings"] def __init__(self, config, add_pooling_layer=True): @@ -972,7 +972,7 @@ class ChineseCLIPTextModel(ChineseCLIPPreTrainedModel): """ ) class ChineseCLIPVisionModel(ChineseCLIPPreTrainedModel): - config_class = ChineseCLIPVisionConfig + config: ChineseCLIPVisionConfig main_input_name = "pixel_values" _no_split_modules = ["ChineseCLIPVisionEmbeddings", "ChineseCLIPVisionAttention"] @@ -1027,7 +1027,7 @@ class ChineseCLIPVisionModel(ChineseCLIPPreTrainedModel): @auto_docstring class ChineseCLIPModel(ChineseCLIPPreTrainedModel): - config_class = ChineseCLIPConfig + config: ChineseCLIPConfig def __init__(self, config: ChineseCLIPConfig): super().__init__(config) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 76bf6ef097..870aab4e2c 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1400,7 +1400,7 @@ class ClapTextPooler(nn.Module): @auto_docstring class ClapPreTrainedModel(PreTrainedModel): - config_class = ClapConfig + config: ClapConfig base_model_prefix = "clap" supports_gradient_checkpointing = False @@ -1428,7 +1428,7 @@ class ClapPreTrainedModel(PreTrainedModel): class ClapAudioModel(ClapPreTrainedModel): - config_class = ClapAudioConfig + config: ClapAudioConfig main_input_name = "input_features" def __init__(self, config: ClapAudioConfig): @@ -1501,7 +1501,7 @@ class ClapAudioModel(ClapPreTrainedModel): """ ) class ClapTextModel(ClapPreTrainedModel): - config_class = ClapTextConfig + config: ClapTextConfig def __init__(self, config, add_pooling_layer=True): r""" @@ -1611,7 +1611,7 @@ class ClapTextModel(ClapPreTrainedModel): @auto_docstring class ClapModel(ClapPreTrainedModel): - config_class = ClapConfig + config: ClapConfig def __init__(self, config: ClapConfig): super().__init__(config) @@ -1843,7 +1843,7 @@ 
class ClapModel(ClapPreTrainedModel): @auto_docstring class ClapTextModelWithProjection(ClapPreTrainedModel): - config_class = ClapTextConfig + config: ClapTextConfig def __init__(self, config: ClapTextConfig): super().__init__(config) @@ -1908,7 +1908,7 @@ class ClapTextModelWithProjection(ClapPreTrainedModel): @auto_docstring class ClapAudioModelWithProjection(ClapPreTrainedModel): - config_class = ClapAudioConfig + config: ClapAudioConfig main_input_name = "input_features" def __init__(self, config: ClapAudioConfig): diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index eb697fcccf..a187bdaa63 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -424,7 +424,7 @@ class CLIPEncoderLayer(GradientCheckpointingLayer): @auto_docstring class CLIPPreTrainedModel(PreTrainedModel): - config_class = CLIPConfig + config: CLIPConfig base_model_prefix = "clip" supports_gradient_checkpointing = True _supports_sdpa = True @@ -670,7 +670,7 @@ class CLIPTextTransformer(nn.Module): """ ) class CLIPTextModel(CLIPPreTrainedModel): - config_class = CLIPTextConfig + config: CLIPTextConfig _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] @@ -775,7 +775,7 @@ class CLIPVisionTransformer(nn.Module): """ ) class CLIPVisionModel(CLIPPreTrainedModel): - config_class = CLIPVisionConfig + config: CLIPVisionConfig main_input_name = "pixel_values" _no_split_modules = ["CLIPEncoderLayer"] @@ -828,7 +828,7 @@ class CLIPVisionModel(CLIPPreTrainedModel): @auto_docstring class CLIPModel(CLIPPreTrainedModel): - config_class = CLIPConfig + config: CLIPConfig _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"] def __init__(self, config: CLIPConfig): @@ -1050,7 +1050,7 @@ class CLIPModel(CLIPPreTrainedModel): @auto_docstring class CLIPTextModelWithProjection(CLIPPreTrainedModel): - config_class = CLIPTextConfig + config: CLIPTextConfig 
_no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] @@ -1116,7 +1116,7 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel): @auto_docstring class CLIPVisionModelWithProjection(CLIPPreTrainedModel): - config_class = CLIPVisionConfig + config: CLIPVisionConfig main_input_name = "pixel_values" def __init__(self, config: CLIPVisionConfig): diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index b6a12e6e63..46c7b1fcf2 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -428,7 +428,7 @@ class CLIPSegEncoderLayer(GradientCheckpointingLayer): @auto_docstring class CLIPSegPreTrainedModel(PreTrainedModel): - config_class = CLIPSegConfig + config: CLIPSegConfig base_model_prefix = "clip" supports_gradient_checkpointing = True @@ -653,7 +653,7 @@ class CLIPSegTextTransformer(nn.Module): class CLIPSegTextModel(CLIPSegPreTrainedModel): - config_class = CLIPSegTextConfig + config: CLIPSegTextConfig _no_split_modules = ["CLIPSegTextEmbeddings", "CLIPSegEncoderLayer"] @@ -757,7 +757,7 @@ class CLIPSegVisionTransformer(nn.Module): class CLIPSegVisionModel(CLIPSegPreTrainedModel): - config_class = CLIPSegVisionConfig + config: CLIPSegVisionConfig main_input_name = "pixel_values" def __init__(self, config: CLIPSegVisionConfig): @@ -809,7 +809,7 @@ class CLIPSegVisionModel(CLIPSegPreTrainedModel): @auto_docstring class CLIPSegModel(CLIPSegPreTrainedModel): - config_class = CLIPSegConfig + config: CLIPSegConfig def __init__(self, config: CLIPSegConfig): super().__init__(config) @@ -1200,7 +1200,7 @@ class CLIPSegDecoder(CLIPSegPreTrainedModel): """ ) class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel): - config_class = CLIPSegConfig + config: CLIPSegConfig def __init__(self, config: CLIPSegConfig): super().__init__(config) diff --git a/src/transformers/models/clvp/modeling_clvp.py 
b/src/transformers/models/clvp/modeling_clvp.py index 16a079c3b8..bf55aa402b 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -783,7 +783,7 @@ class ClvpConditioningEncoder(nn.Module): @auto_docstring class ClvpPreTrainedModel(PreTrainedModel): - config_class = ClvpConfig + config: ClvpConfig base_model_prefix = "clvp" supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" @@ -1434,7 +1434,7 @@ class ClvpForCausalLM(ClvpPreTrainedModel, GenerationMixin): """ ) class ClvpModelForConditionalGeneration(ClvpPreTrainedModel, GenerationMixin): - config_class = ClvpConfig + config: ClvpConfig def __init__(self, config: ClvpConfig): super().__init__(config) diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index acc7877ede..29798287dd 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -281,7 +281,7 @@ class CodeGenBlock(GradientCheckpointingLayer): @auto_docstring class CodeGenPreTrainedModel(PreTrainedModel): - config_class = CodeGenConfig + config: CodeGenConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True _no_split_modules = ["CodeGenBlock"] diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 9a60159137..0180161e13 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -336,7 +336,7 @@ class CohereDecoderLayer(GradientCheckpointingLayer): @auto_docstring class CoherePreTrainedModel(PreTrainedModel): - config_class = CohereConfig + config: CohereConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["CohereDecoderLayer"] diff --git a/src/transformers/models/cohere2/modeling_cohere2.py 
b/src/transformers/models/cohere2/modeling_cohere2.py index 10e65a1802..a051551690 100644 --- a/src/transformers/models/cohere2/modeling_cohere2.py +++ b/src/transformers/models/cohere2/modeling_cohere2.py @@ -313,7 +313,7 @@ class Cohere2DecoderLayer(GradientCheckpointingLayer): @auto_docstring class Cohere2PreTrainedModel(PreTrainedModel): - config_class = Cohere2Config + config: Cohere2Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Cohere2DecoderLayer"] diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index 4552f57bf1..369b2ecae3 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -377,7 +377,7 @@ class Cohere2DecoderLayer(CohereDecoderLayer): class Cohere2PreTrainedModel(CoherePreTrainedModel): - config_class = Cohere2Config + config: Cohere2Config class Cohere2Model(Gemma2Model): diff --git a/src/transformers/models/colpali/modeling_colpali.py b/src/transformers/models/colpali/modeling_colpali.py index 1f38ea407a..63ce8975e8 100644 --- a/src/transformers/models/colpali/modeling_colpali.py +++ b/src/transformers/models/colpali/modeling_colpali.py @@ -30,7 +30,7 @@ from .configuration_colpali import ColPaliConfig @auto_docstring class ColPaliPreTrainedModel(PreTrainedModel): - config_class = ColPaliConfig + config: ColPaliConfig base_model_prefix = "model" _no_split_modules = [] diff --git a/src/transformers/models/colqwen2/modeling_colqwen2.py b/src/transformers/models/colqwen2/modeling_colqwen2.py index 7ef0acf080..6cbdaab123 100644 --- a/src/transformers/models/colqwen2/modeling_colqwen2.py +++ b/src/transformers/models/colqwen2/modeling_colqwen2.py @@ -38,7 +38,7 @@ if is_torch_available(): @auto_docstring class ColQwen2PreTrainedModel(PreTrainedModel): - config_class = ColQwen2Config + config: ColQwen2Config base_model_prefix = "model" _no_split_modules = [] 
_supports_flash_attn = True diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index bd9bd8a514..25eacb959a 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -956,7 +956,7 @@ class MLP(nn.Module): @auto_docstring # Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDetr class ConditionalDetrPreTrainedModel(PreTrainedModel): - config_class = ConditionalDetrConfig + config: ConditionalDetrConfig base_model_prefix = "model" main_input_name = "pixel_values" _no_split_modules = [r"ConditionalDetrConvEncoder", r"ConditionalDetrEncoderLayer", r"ConditionalDetrDecoderLayer"] diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py index a43ebfa025..130cf18384 100755 --- a/src/transformers/models/convbert/modeling_convbert.py +++ b/src/transformers/models/convbert/modeling_convbert.py @@ -232,7 +232,7 @@ class ConvBertEmbeddings(nn.Module): @auto_docstring class ConvBertPreTrainedModel(PreTrainedModel): - config_class = ConvBertConfig + config: ConvBertConfig load_tf_weights = load_tf_weights_in_convbert base_model_prefix = "convbert" supports_gradient_checkpointing = True diff --git a/src/transformers/models/convnext/modeling_convnext.py b/src/transformers/models/convnext/modeling_convnext.py index 81e262be04..5921aabc0f 100755 --- a/src/transformers/models/convnext/modeling_convnext.py +++ b/src/transformers/models/convnext/modeling_convnext.py @@ -253,7 +253,7 @@ class ConvNextEncoder(nn.Module): @auto_docstring class ConvNextPreTrainedModel(PreTrainedModel): - config_class = ConvNextConfig + config: ConvNextConfig base_model_prefix = "convnext" main_input_name = "pixel_values" _no_split_modules = ["ConvNextLayer"] diff --git 
a/src/transformers/models/convnextv2/modeling_convnextv2.py b/src/transformers/models/convnextv2/modeling_convnextv2.py index c2b3372c97..b9d57b87e9 100644 --- a/src/transformers/models/convnextv2/modeling_convnextv2.py +++ b/src/transformers/models/convnextv2/modeling_convnextv2.py @@ -273,7 +273,7 @@ class ConvNextV2Encoder(nn.Module): @auto_docstring class ConvNextV2PreTrainedModel(PreTrainedModel): - config_class = ConvNextV2Config + config: ConvNextV2Config base_model_prefix = "convnextv2" main_input_name = "pixel_values" _no_split_modules = ["ConvNextV2Layer"] diff --git a/src/transformers/models/cpmant/modeling_cpmant.py b/src/transformers/models/cpmant/modeling_cpmant.py index d68521c6dd..e1b1c33276 100755 --- a/src/transformers/models/cpmant/modeling_cpmant.py +++ b/src/transformers/models/cpmant/modeling_cpmant.py @@ -523,7 +523,7 @@ class CpmAntOutput(nn.Module): @auto_docstring class CpmAntPreTrainedModel(PreTrainedModel): - config_class = CpmAntConfig + config: CpmAntConfig base_model_prefix = "cpmant" def _init_weights(self, module): diff --git a/src/transformers/models/csm/modeling_csm.py b/src/transformers/models/csm/modeling_csm.py index d7807065e5..91b3ff3987 100644 --- a/src/transformers/models/csm/modeling_csm.py +++ b/src/transformers/models/csm/modeling_csm.py @@ -361,7 +361,7 @@ class CsmDecoderLayer(GradientCheckpointingLayer): ) @auto_docstring class CsmPreTrainedModel(PreTrainedModel): - config_class = CsmConfig + config: CsmConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["CsmDecoderLayer"] @@ -398,7 +398,7 @@ class CsmPreTrainedModel(PreTrainedModel): @auto_docstring class CsmDepthDecoderModel(CsmPreTrainedModel): - config_class = CsmDepthDecoderConfig + config: CsmDepthDecoderConfig def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/csm/modular_csm.py b/src/transformers/models/csm/modular_csm.py index ffde5c82eb..e1bc64fd6c 100644 --- 
a/src/transformers/models/csm/modular_csm.py +++ b/src/transformers/models/csm/modular_csm.py @@ -124,7 +124,7 @@ class CsmDecoderLayer(LlamaDecoderLayer): ) @auto_docstring class CsmPreTrainedModel(PreTrainedModel): - config_class = CsmConfig + config: CsmConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["CsmDecoderLayer"] @@ -161,7 +161,7 @@ class CsmPreTrainedModel(PreTrainedModel): @auto_docstring class CsmDepthDecoderModel(LlamaModel, CsmPreTrainedModel): - config_class = CsmDepthDecoderConfig + config: CsmDepthDecoderConfig def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index ba1a737efc..675ef96592 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -210,7 +210,7 @@ class EncoderLayer(nn.Module): @auto_docstring class CTRLPreTrainedModel(PreTrainedModel): - config_class = CTRLConfig + config: CTRLConfig base_model_prefix = "transformer" def _init_weights(self, module): diff --git a/src/transformers/models/cvt/modeling_cvt.py b/src/transformers/models/cvt/modeling_cvt.py index aec8ec8123..e838ffb3cd 100644 --- a/src/transformers/models/cvt/modeling_cvt.py +++ b/src/transformers/models/cvt/modeling_cvt.py @@ -509,7 +509,7 @@ class CvtEncoder(nn.Module): @auto_docstring class CvtPreTrainedModel(PreTrainedModel): - config_class = CvtConfig + config: CvtConfig base_model_prefix = "cvt" main_input_name = "pixel_values" _no_split_modules = ["CvtLayer"] diff --git a/src/transformers/models/d_fine/modeling_d_fine.py b/src/transformers/models/d_fine/modeling_d_fine.py index b19cde98fc..76726a6512 100644 --- a/src/transformers/models/d_fine/modeling_d_fine.py +++ b/src/transformers/models/d_fine/modeling_d_fine.py @@ -882,7 +882,7 @@ def _get_clones(partial_module, N): @auto_docstring class DFinePreTrainedModel(PreTrainedModel): - config_class = 
DFineConfig + config: DFineConfig base_model_prefix = "d_fine" main_input_name = "pixel_values" _no_split_modules = [r"DFineHybridEncoder", r"DFineDecoderLayer"] diff --git a/src/transformers/models/dab_detr/modeling_dab_detr.py b/src/transformers/models/dab_detr/modeling_dab_detr.py index 119a7a0b16..d9060213f5 100644 --- a/src/transformers/models/dab_detr/modeling_dab_detr.py +++ b/src/transformers/models/dab_detr/modeling_dab_detr.py @@ -809,7 +809,7 @@ class DabDetrMLP(nn.Module): # Modified from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->DabDetr @auto_docstring class DabDetrPreTrainedModel(PreTrainedModel): - config_class = DabDetrConfig + config: DabDetrConfig base_model_prefix = "model" main_input_name = "pixel_values" _no_split_modules = [r"DabDetrConvEncoder", r"DabDetrEncoderLayer", r"DabDetrDecoderLayer"] diff --git a/src/transformers/models/dac/modeling_dac.py b/src/transformers/models/dac/modeling_dac.py index 398d258bef..03227e72cf 100644 --- a/src/transformers/models/dac/modeling_dac.py +++ b/src/transformers/models/dac/modeling_dac.py @@ -472,7 +472,7 @@ class DacEncoder(nn.Module): @auto_docstring class DacPreTrainedModel(PreTrainedAudioTokenizerBase): - config_class = DacConfig + config: DacConfig base_model_prefix = "dac" main_input_name = "input_values" diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 419bcb7b68..622e28b008 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -500,7 +500,7 @@ class Data2VecAudioAdapter(nn.Module): @auto_docstring class Data2VecAudioPreTrainedModel(PreTrainedModel): - config_class = Data2VecAudioConfig + config: Data2VecAudioConfig base_model_prefix = "data2vec_audio" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py 
b/src/transformers/models/data2vec/modeling_data2vec_text.py index 93d217eed1..32afbfc320 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -570,7 +570,7 @@ class Data2VecTextPooler(nn.Module): @auto_docstring class Data2VecTextPreTrainedModel(PreTrainedModel): - config_class = Data2VecTextConfig + config: Data2VecTextConfig base_model_prefix = "data2vec_text" supports_gradient_checkpointing = True _no_split_modules = ["Data2VecTextForTextEmbeddings", "Data2VecTextLayer"] diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index 2cf64ac21f..48c103cf64 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -736,7 +736,7 @@ class Data2VecVisionEncoder(nn.Module): @auto_docstring # Copied from transformers.models.beit.modeling_beit.BeitPreTrainedModel with Beit->Data2VecVision,beit->data2vec_vision class Data2VecVisionPreTrainedModel(PreTrainedModel): - config_class = Data2VecVisionConfig + config: Data2VecVisionConfig base_model_prefix = "data2vec_vision" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/data2vec/modular_data2vec_audio.py b/src/transformers/models/data2vec/modular_data2vec_audio.py index 314e08ed4e..0be5019c01 100644 --- a/src/transformers/models/data2vec/modular_data2vec_audio.py +++ b/src/transformers/models/data2vec/modular_data2vec_audio.py @@ -135,7 +135,7 @@ class Data2VecAudioAdapter(Wav2Vec2Adapter): class Data2VecAudioPreTrainedModel(PreTrainedModel, Wav2Vec2PreTrainedModel): - config_class = Data2VecAudioConfig + config: Data2VecAudioConfig base_model_prefix = "data2vec_audio" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/dbrx/modeling_dbrx.py 
b/src/transformers/models/dbrx/modeling_dbrx.py index 4e3d2cb1b6..86b4944f08 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -802,7 +802,7 @@ class DbrxBlock(GradientCheckpointingLayer): @auto_docstring class DbrxPreTrainedModel(PreTrainedModel): - config_class = DbrxConfig + config: DbrxConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True _no_split_modules = ["DbrxBlock"] diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index c6dd97736c..0e298f5229 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -610,7 +610,7 @@ class DebertaEncoder(nn.Module): @auto_docstring class DebertaPreTrainedModel(PreTrainedModel): - config_class = DebertaConfig + config: DebertaConfig base_model_prefix = "deberta" _keys_to_ignore_on_load_unexpected = ["position_embeddings"] supports_gradient_checkpointing = True diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 9089fe1f65..047d4b3acd 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -690,7 +690,7 @@ class DebertaV2Encoder(nn.Module): @auto_docstring class DebertaV2PreTrainedModel(PreTrainedModel): - config_class = DebertaV2Config + config: DebertaV2Config base_model_prefix = "deberta" _keys_to_ignore_on_load_unexpected = ["position_embeddings"] supports_gradient_checkpointing = True diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py index 689fff29e2..1b33296d7d 100755 --- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py +++ 
b/src/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -447,7 +447,7 @@ class DecisionTransformerGPT2Block(GradientCheckpointingLayer): @auto_docstring class DecisionTransformerGPT2PreTrainedModel(PreTrainedModel): - config_class = DecisionTransformerConfig + config: DecisionTransformerConfig load_tf_weights = load_tf_weights_in_gpt2 base_model_prefix = "transformer" is_parallelizable = True @@ -740,7 +740,7 @@ class DecisionTransformerPreTrainedModel(PreTrainedModel): models. """ - config_class = DecisionTransformerConfig + config: DecisionTransformerConfig base_model_prefix = "decision_transformer" main_input_name = "states" supports_gradient_checkpointing = False diff --git a/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py b/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py index ff36b6d43f..3794a392fd 100644 --- a/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py @@ -450,7 +450,7 @@ class DeepseekV2DecoderLayer(GradientCheckpointingLayer): @auto_docstring class DeepseekV2PreTrainedModel(PreTrainedModel): - config_class = DeepseekV2Config + config: DeepseekV2Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["DeepseekV2DecoderLayer"] diff --git a/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py b/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py index 708a370171..9f10d63044 100644 --- a/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py @@ -489,7 +489,7 @@ class DeepseekV3DecoderLayer(GradientCheckpointingLayer): @auto_docstring class DeepseekV3PreTrainedModel(PreTrainedModel): - config_class = DeepseekV3Config + config: DeepseekV3Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["DeepseekV3DecoderLayer"] diff --git 
a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 8c26e2ffaf..db74c715b5 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -916,7 +916,7 @@ class DeformableDetrDecoderLayer(GradientCheckpointingLayer): @auto_docstring class DeformableDetrPreTrainedModel(PreTrainedModel): - config_class = DeformableDetrConfig + config: DeformableDetrConfig base_model_prefix = "model" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index 2966d465f6..7499b6d7aa 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -447,7 +447,7 @@ class DeiTEncoder(nn.Module): @auto_docstring class DeiTPreTrainedModel(PreTrainedModel): - config_class = DeiTConfig + config: DeiTConfig base_model_prefix = "deit" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/deprecated/deta/modeling_deta.py b/src/transformers/models/deprecated/deta/modeling_deta.py index 64912b512f..e109b84938 100644 --- a/src/transformers/models/deprecated/deta/modeling_deta.py +++ b/src/transformers/models/deprecated/deta/modeling_deta.py @@ -1022,7 +1022,7 @@ class DetaDecoderLayer(GradientCheckpointingLayer): class DetaPreTrainedModel(PreTrainedModel): - config_class = DetaConfig + config: DetaConfig base_model_prefix = "model" main_input_name = "pixel_values" _no_split_modules = [r"DetaBackboneWithPositionalEncodings", r"DetaEncoderLayer", r"DetaDecoderLayer"] diff --git a/src/transformers/models/deprecated/efficientformer/modeling_efficientformer.py b/src/transformers/models/deprecated/efficientformer/modeling_efficientformer.py index 5e1a0cdf9a..7d75e45dbc 100644 --- 
a/src/transformers/models/deprecated/efficientformer/modeling_efficientformer.py +++ b/src/transformers/models/deprecated/efficientformer/modeling_efficientformer.py @@ -500,7 +500,7 @@ class EfficientFormerPreTrainedModel(PreTrainedModel): models. """ - config_class = EfficientFormerConfig + config: EfficientFormerConfig base_model_prefix = "efficientformer" main_input_name = "pixel_values" supports_gradient_checkpointing = False diff --git a/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py b/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py index cf41dc2e29..69e5e61d3c 100755 --- a/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py +++ b/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py @@ -400,7 +400,7 @@ class ErnieMPreTrainedModel(PreTrainedModel): models. """ - config_class = ErnieMConfig + config: ErnieMConfig base_model_prefix = "ernie_m" def _init_weights(self, module): diff --git a/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py index 25c56354c3..10664a8fef 100644 --- a/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py +++ b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py @@ -673,7 +673,7 @@ class GPTSanJapanesePreTrainedModel(PreTrainedModel): models. 
""" - config_class = GPTSanJapaneseConfig + config: GPTSanJapaneseConfig base_model_prefix = "gptsan_japanese" supports_gradient_checkpointing = False _no_split_modules = ["GPTSanJapaneseBlock"] diff --git a/src/transformers/models/deprecated/graphormer/modeling_graphormer.py b/src/transformers/models/deprecated/graphormer/modeling_graphormer.py index 337e127683..b3e8ea742c 100755 --- a/src/transformers/models/deprecated/graphormer/modeling_graphormer.py +++ b/src/transformers/models/deprecated/graphormer/modeling_graphormer.py @@ -704,7 +704,7 @@ class GraphormerPreTrainedModel(PreTrainedModel): models. """ - config_class = GraphormerConfig + config: GraphormerConfig base_model_prefix = "graphormer" main_input_name_nodes = "input_nodes" main_input_name_edges = "input_edges" diff --git a/src/transformers/models/deprecated/jukebox/modeling_jukebox.py b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py index 779519c946..4dfd1c6924 100755 --- a/src/transformers/models/deprecated/jukebox/modeling_jukebox.py +++ b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py @@ -598,7 +598,7 @@ Ringer, Tom Ash, John Hughes, David MacLeod, Jamie Dougherty](https://huggingfac JUKEBOX_START_DOCSTRING, ) class JukeboxVQVAE(PreTrainedModel): - config_class = JukeboxVQVAEConfig + config: JukeboxVQVAEConfig base_model_prefix = "vqvae" def _init_weights(self, module): @@ -1788,7 +1788,7 @@ class JukeboxPrior(PreTrainedModel): the vqvae module to avoid getting the parameters. """ - config_class = JukeboxPriorConfig + config: JukeboxPriorConfig def _init_weights(self, module): init_scale = self.config.init_scale @@ -2264,7 +2264,7 @@ class JukeboxPreTrainedModel(PreTrainedModel): models. 
""" - config_class = JukeboxConfig + config: JukeboxConfig base_model_prefix = "jukebox" supports_gradient_checkpointing = False diff --git a/src/transformers/models/deprecated/mctct/modeling_mctct.py b/src/transformers/models/deprecated/mctct/modeling_mctct.py index 7bc835cf13..adaf4c1a70 100755 --- a/src/transformers/models/deprecated/mctct/modeling_mctct.py +++ b/src/transformers/models/deprecated/mctct/modeling_mctct.py @@ -423,7 +423,7 @@ class MCTCTPreTrainedModel(PreTrainedModel): models. """ - config_class = MCTCTConfig + config: MCTCTConfig base_model_prefix = "mctct" main_input_name = "input_features" supports_gradient_checkpointing = True diff --git a/src/transformers/models/deprecated/mega/modeling_mega.py b/src/transformers/models/deprecated/mega/modeling_mega.py index 24e0b557f9..85f314aeea 100644 --- a/src/transformers/models/deprecated/mega/modeling_mega.py +++ b/src/transformers/models/deprecated/mega/modeling_mega.py @@ -1329,7 +1329,7 @@ class MegaPreTrainedModel(PreTrainedModel): models. """ - config_class = MegaConfig + config: MegaConfig base_model_prefix = "mega" supports_gradient_checkpointing = False _no_split_modules = ["MegaMovingAverageGatedAttention"] diff --git a/src/transformers/models/deprecated/nat/modeling_nat.py b/src/transformers/models/deprecated/nat/modeling_nat.py index 15c7dc62b3..0a951623bc 100644 --- a/src/transformers/models/deprecated/nat/modeling_nat.py +++ b/src/transformers/models/deprecated/nat/modeling_nat.py @@ -615,7 +615,7 @@ class NatPreTrainedModel(PreTrainedModel): models. 
""" - config_class = NatConfig + config: NatConfig base_model_prefix = "nat" main_input_name = "pixel_values" diff --git a/src/transformers/models/deprecated/nezha/modeling_nezha.py b/src/transformers/models/deprecated/nezha/modeling_nezha.py index 2bead71cad..692d5dd092 100644 --- a/src/transformers/models/deprecated/nezha/modeling_nezha.py +++ b/src/transformers/models/deprecated/nezha/modeling_nezha.py @@ -699,7 +699,7 @@ class NezhaPreTrainedModel(PreTrainedModel): models. """ - config_class = NezhaConfig + config: NezhaConfig load_tf_weights = load_tf_weights_in_nezha base_model_prefix = "nezha" supports_gradient_checkpointing = True diff --git a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py index 2d473cd423..66efbe1c24 100644 --- a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py @@ -431,7 +431,7 @@ OPEN_LLAMA_START_DOCSTRING = r""" OPEN_LLAMA_START_DOCSTRING, ) class OpenLlamaPreTrainedModel(PreTrainedModel): - config_class = OpenLlamaConfig + config: OpenLlamaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["OpenLlamaDecoderLayer"] diff --git a/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py b/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py index 914428b96a..7245f44c34 100755 --- a/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py +++ b/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py @@ -708,7 +708,7 @@ class QDQBertPreTrainedModel(PreTrainedModel): models. 
""" - config_class = QDQBertConfig + config: QDQBertConfig load_tf_weights = load_tf_weights_in_qdqbert base_model_prefix = "bert" supports_gradient_checkpointing = True diff --git a/src/transformers/models/deprecated/realm/modeling_realm.py b/src/transformers/models/deprecated/realm/modeling_realm.py index 68787c60e9..767bcf5a9c 100644 --- a/src/transformers/models/deprecated/realm/modeling_realm.py +++ b/src/transformers/models/deprecated/realm/modeling_realm.py @@ -940,7 +940,7 @@ class RealmPreTrainedModel(PreTrainedModel): models. """ - config_class = RealmConfig + config: RealmConfig load_tf_weights = load_tf_weights_in_realm base_model_prefix = "realm" diff --git a/src/transformers/models/deprecated/retribert/modeling_retribert.py b/src/transformers/models/deprecated/retribert/modeling_retribert.py index bcae1c0239..06806e8e6d 100644 --- a/src/transformers/models/deprecated/retribert/modeling_retribert.py +++ b/src/transformers/models/deprecated/retribert/modeling_retribert.py @@ -39,7 +39,7 @@ class RetriBertPreTrainedModel(PreTrainedModel): models. 
""" - config_class = RetriBertConfig + config: RetriBertConfig load_tf_weights = None base_model_prefix = "retribert" diff --git a/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py index ed4c96c89b..1012c9537a 100755 --- a/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py @@ -385,7 +385,7 @@ class Speech2Text2DecoderLayer(GradientCheckpointingLayer): class Speech2Text2PreTrainedModel(PreTrainedModel): - config_class = Speech2Text2Config + config: Speech2Text2Config base_model_prefix = "model" supports_gradient_checkpointing = True diff --git a/src/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py b/src/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py index fdfbecf7fe..cf49223b8b 100644 --- a/src/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py +++ b/src/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py @@ -153,7 +153,7 @@ class TrajectoryTransformerPreTrainedModel(PreTrainedModel): models. """ - config_class = TrajectoryTransformerConfig + config: TrajectoryTransformerConfig load_tf_weights = load_tf_weights_in_trajectory_transformer base_model_prefix = "trajectory_transformer" main_input_name = "trajectories" diff --git a/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py index 664bc577b1..9c469036f2 100644 --- a/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py +++ b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py @@ -458,7 +458,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel): models. 
""" - config_class = TransfoXLConfig + config: TransfoXLConfig load_tf_weights = load_tf_weights_in_transfo_xl base_model_prefix = "transformer" diff --git a/src/transformers/models/deprecated/tvlt/modeling_tvlt.py b/src/transformers/models/deprecated/tvlt/modeling_tvlt.py index 5280248c59..5f34083ac2 100644 --- a/src/transformers/models/deprecated/tvlt/modeling_tvlt.py +++ b/src/transformers/models/deprecated/tvlt/modeling_tvlt.py @@ -572,7 +572,7 @@ class TvltPreTrainedModel(PreTrainedModel): models. """ - config_class = TvltConfig + config: TvltConfig base_model_prefix = "tvlt" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/deprecated/van/modeling_van.py b/src/transformers/models/deprecated/van/modeling_van.py index 2d9917164d..7bbae4edb1 100644 --- a/src/transformers/models/deprecated/van/modeling_van.py +++ b/src/transformers/models/deprecated/van/modeling_van.py @@ -361,7 +361,7 @@ class VanPreTrainedModel(PreTrainedModel): models. """ - config_class = VanConfig + config: VanConfig base_model_prefix = "van" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py index 03bcc24beb..7d1c22301d 100644 --- a/src/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py @@ -483,7 +483,7 @@ class ViTHybridPreTrainedModel(PreTrainedModel): models. 
""" - config_class = ViTHybridConfig + config: ViTHybridConfig base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py b/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py index 19d7988a69..9912cca22f 100644 --- a/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py +++ b/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py @@ -540,7 +540,7 @@ class XLMProphetNetDecoderLMOutput(ModelOutput): class XLMProphetNetPreTrainedModel(PreTrainedModel): - config_class = XLMProphetNetConfig + config: XLMProphetNetConfig base_model_prefix = "prophetnet" supports_gradient_checkpointing = True diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index f5ace46212..6fb3b23cd3 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -211,7 +211,7 @@ class DepthAnythingFeatureFusionStage(nn.Module): # avoiding sdpa and flash_attn_2 support, it's done in the backend @auto_docstring class DepthAnythingPreTrainedModel(PreTrainedModel): - config_class = DepthAnythingConfig + config: DepthAnythingConfig base_model_prefix = "depth_anything" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index cd96a3c1b8..f894a6d8ff 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -607,7 +607,7 @@ class DepthProNeck(nn.Module): @auto_docstring class DepthProPreTrainedModel(PreTrainedModel): - config_class = DepthProConfig + config: DepthProConfig base_model_prefix = "depth_pro" 
main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index e485fba776..d2a205fb21 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -717,7 +717,7 @@ class DetrDecoderLayer(GradientCheckpointingLayer): @auto_docstring class DetrPreTrainedModel(PreTrainedModel): - config_class = DetrConfig + config: DetrConfig base_model_prefix = "model" main_input_name = "pixel_values" _no_split_modules = [r"DetrConvEncoder", r"DetrEncoderLayer", r"DetrDecoderLayer"] diff --git a/src/transformers/models/dia/modeling_dia.py b/src/transformers/models/dia/modeling_dia.py index da0f616eda..2bf05cf683 100644 --- a/src/transformers/models/dia/modeling_dia.py +++ b/src/transformers/models/dia/modeling_dia.py @@ -61,7 +61,7 @@ logger = logging.get_logger(__name__) @auto_docstring class DiaPreTrainedModel(PreTrainedModel): - config_class = DiaConfig + config: DiaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _supports_flash_attn = True diff --git a/src/transformers/models/dia/modular_dia.py b/src/transformers/models/dia/modular_dia.py index 7da15d7c10..8c84d936c5 100644 --- a/src/transformers/models/dia/modular_dia.py +++ b/src/transformers/models/dia/modular_dia.py @@ -56,7 +56,7 @@ logger = logging.get_logger(__name__) @auto_docstring class DiaPreTrainedModel(PreTrainedModel): - config_class = DiaConfig + config: DiaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _supports_flash_attn = True diff --git a/src/transformers/models/diffllama/modeling_diffllama.py b/src/transformers/models/diffllama/modeling_diffllama.py index e2b093fd8e..c97319622d 100644 --- a/src/transformers/models/diffllama/modeling_diffllama.py +++ b/src/transformers/models/diffllama/modeling_diffllama.py @@ -525,7 +525,7 @@ class DiffLlamaDecoderLayer(GradientCheckpointingLayer): 
@auto_docstring class DiffLlamaPreTrainedModel(PreTrainedModel): - config_class = DiffLlamaConfig + config: DiffLlamaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["DiffLlamaDecoderLayer"] diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py index 140d16bd33..916fc94a79 100644 --- a/src/transformers/models/dinat/modeling_dinat.py +++ b/src/transformers/models/dinat/modeling_dinat.py @@ -583,7 +583,7 @@ class DinatEncoder(nn.Module): @auto_docstring class DinatPreTrainedModel(PreTrainedModel): - config_class = DinatConfig + config: DinatConfig base_model_prefix = "dinat" main_input_name = "pixel_values" diff --git a/src/transformers/models/dinov2/modeling_dinov2.py b/src/transformers/models/dinov2/modeling_dinov2.py index 102b15a5fb..5748cb91da 100644 --- a/src/transformers/models/dinov2/modeling_dinov2.py +++ b/src/transformers/models/dinov2/modeling_dinov2.py @@ -491,7 +491,7 @@ class Dinov2Encoder(nn.Module): @auto_docstring class Dinov2PreTrainedModel(PreTrainedModel): - config_class = Dinov2Config + config: Dinov2Config base_model_prefix = "dinov2" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py index 0c09e2f75d..69621a0c1e 100644 --- a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +++ b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py @@ -509,7 +509,7 @@ class Dinov2WithRegistersEncoder(nn.Module): @auto_docstring class Dinov2WithRegistersPreTrainedModel(PreTrainedModel): - config_class = Dinov2WithRegistersConfig + config: Dinov2WithRegistersConfig base_model_prefix = "dinov2_with_registers" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git 
a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index feb8d6d8bc..c3b604d566 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -571,7 +571,7 @@ class Transformer(nn.Module): # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # @auto_docstring class DistilBertPreTrainedModel(PreTrainedModel): - config_class = DistilBertConfig + config: DistilBertConfig load_tf_weights = None base_model_prefix = "distilbert" supports_gradient_checkpointing = True diff --git a/src/transformers/models/doge/modeling_doge.py b/src/transformers/models/doge/modeling_doge.py index 29aa4b1961..63813fe3d1 100644 --- a/src/transformers/models/doge/modeling_doge.py +++ b/src/transformers/models/doge/modeling_doge.py @@ -486,7 +486,7 @@ class DogeDecoderLayer(GradientCheckpointingLayer): @auto_docstring class DogePreTrainedModel(PreTrainedModel): - config_class = DogeConfig + config: DogeConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["DogeDecoderLayer"] diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index 7af6a3ad07..289844d0bf 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -826,7 +826,7 @@ class DonutSwinEncoder(nn.Module): @auto_docstring # Copied from transformers.models.swin.modeling_swin.SwinPreTrainedModel with Swin->DonutSwin,swin->donut class DonutSwinPreTrainedModel(PreTrainedModel): - config_class = DonutSwinConfig + config: DonutSwinConfig base_model_prefix = "donut" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/dots1/modeling_dots1.py b/src/transformers/models/dots1/modeling_dots1.py index 665aa1b85d..6a84d77a05 100644 --- 
a/src/transformers/models/dots1/modeling_dots1.py +++ b/src/transformers/models/dots1/modeling_dots1.py @@ -409,7 +409,7 @@ class Dots1DecoderLayer(GradientCheckpointingLayer): @auto_docstring class Dots1PreTrainedModel(PreTrainedModel): - config_class = Dots1Config + config: Dots1Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Dots1DecoderLayer"] diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index 3e18b3e732..f1ae00a02e 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -246,7 +246,7 @@ class DPRPretrainedContextEncoder(DPRPreTrainedModel): models. """ - config_class = DPRConfig + config: DPRConfig load_tf_weights = None base_model_prefix = "ctx_encoder" @@ -257,7 +257,7 @@ class DPRPretrainedQuestionEncoder(DPRPreTrainedModel): models. """ - config_class = DPRConfig + config: DPRConfig load_tf_weights = None base_model_prefix = "question_encoder" @@ -268,7 +268,7 @@ class DPRPretrainedReader(DPRPreTrainedModel): models. 
""" - config_class = DPRConfig + config: DPRConfig load_tf_weights = None base_model_prefix = "span_predictor" diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index fd9fd48993..60692d9fa7 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -815,7 +815,7 @@ class DPTFeatureFusionLayer(nn.Module): @auto_docstring class DPTPreTrainedModel(PreTrainedModel): - config_class = DPTConfig + config: DPTConfig base_model_prefix = "dpt" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py index 26410fc120..814a2375ac 100644 --- a/src/transformers/models/efficientnet/modeling_efficientnet.py +++ b/src/transformers/models/efficientnet/modeling_efficientnet.py @@ -432,7 +432,7 @@ class EfficientNetEncoder(nn.Module): @auto_docstring class EfficientNetPreTrainedModel(PreTrainedModel): - config_class = EfficientNetConfig + config: EfficientNetConfig base_model_prefix = "efficientnet" main_input_name = "pixel_values" _no_split_modules = [] diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index c22b47c55a..ef7e8051e6 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -646,7 +646,7 @@ class ElectraGeneratorPredictions(nn.Module): @auto_docstring class ElectraPreTrainedModel(PreTrainedModel): - config_class = ElectraConfig + config: ElectraConfig load_tf_weights = load_tf_weights_in_electra base_model_prefix = "electra" supports_gradient_checkpointing = True @@ -1284,7 +1284,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel): @auto_docstring class ElectraForQuestionAnswering(ElectraPreTrainedModel): - config_class = ElectraConfig + config: ElectraConfig 
base_model_prefix = "electra" def __init__(self, config): diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index 6633abc494..b66e244234 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -922,7 +922,7 @@ class Emu3VQVAEDecoder(nn.Module): """ ) class Emu3VQVAE(PreTrainedModel): - config_class = Emu3VQVAEConfig + config: Emu3VQVAEConfig base_model_prefix = "emuvideovq" main_input_name = "pixel_values" _supports_sdpa = True @@ -1088,7 +1088,7 @@ class Emu3ImageVocabularyMapping: @auto_docstring class Emu3PreTrainedModel(PreTrainedModel): - config_class = Emu3Config + config: Emu3Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = [ @@ -1246,7 +1246,7 @@ class Emu3ForCausalLM(Emu3PreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] _tp_plan = {"lm_head": "colwise_rep"} _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} - config_class = Emu3TextConfig + config: Emu3TextConfig def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py index 58689de09b..e32cfd2dfc 100644 --- a/src/transformers/models/emu3/modular_emu3.py +++ b/src/transformers/models/emu3/modular_emu3.py @@ -674,7 +674,7 @@ class Emu3VQVAEDecoder(nn.Module): """ ) class Emu3VQVAE(PreTrainedModel): - config_class = Emu3VQVAEConfig + config: Emu3VQVAEConfig base_model_prefix = "emuvideovq" main_input_name = "pixel_values" _supports_sdpa = True @@ -873,7 +873,7 @@ class Emu3TextModel(LlamaModel, Emu3PreTrainedModel): class Emu3ForCausalLM(LlamaForCausalLM, Emu3PreTrainedModel, GenerationMixin): - config_class = Emu3TextConfig + config: Emu3TextConfig def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/encodec/modeling_encodec.py b/src/transformers/models/encodec/modeling_encodec.py 
index 6e610ba295..cbdfd038e7 100644 --- a/src/transformers/models/encodec/modeling_encodec.py +++ b/src/transformers/models/encodec/modeling_encodec.py @@ -446,7 +446,7 @@ class EncodecResidualVectorQuantizer(nn.Module): @auto_docstring class EncodecPreTrainedModel(PreTrainedModel): - config_class = EncodecConfig + config: EncodecConfig base_model_prefix = "encodec" main_input_name = "input_values" diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 38c7e2197b..44d8e6cd7e 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -73,7 +73,7 @@ class EncoderDecoderModel(PreTrainedModel, GenerationMixin): :meth*~transformers.AutoModelForCausalLM.from_pretrained* class method for the decoder. """ - config_class = EncoderDecoderConfig + config: EncoderDecoderConfig base_model_prefix = "encoder_decoder" main_input_name = "input_ids" supports_gradient_checkpointing = True diff --git a/src/transformers/models/eomt/modeling_eomt.py b/src/transformers/models/eomt/modeling_eomt.py index 29c33cec05..c0f5461f50 100644 --- a/src/transformers/models/eomt/modeling_eomt.py +++ b/src/transformers/models/eomt/modeling_eomt.py @@ -995,7 +995,7 @@ class EomtPreTrainedModel(PreTrainedModel): models. """ - config_class = EomtConfig + config: EomtConfig base_model_prefix = "eomt" main_input_name = "pixel_values" supports_gradient_checkpointing = False diff --git a/src/transformers/models/eomt/modular_eomt.py b/src/transformers/models/eomt/modular_eomt.py index 0a1b7dfd95..a98cf4f779 100644 --- a/src/transformers/models/eomt/modular_eomt.py +++ b/src/transformers/models/eomt/modular_eomt.py @@ -367,7 +367,7 @@ class EomtPreTrainedModel(PreTrainedModel): models. 
""" - config_class = EomtConfig + config: EomtConfig base_model_prefix = "eomt" main_input_name = "pixel_values" supports_gradient_checkpointing = False diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index a5d55bafd7..d291db4e6b 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -630,7 +630,7 @@ class ErniePreTrainingHeads(nn.Module): @auto_docstring class ErniePreTrainedModel(PreTrainedModel): - config_class = ErnieConfig + config: ErnieConfig base_model_prefix = "ernie" supports_gradient_checkpointing = True diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index c9388e5881..d7fd324285 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -738,7 +738,7 @@ class EsmPooler(nn.Module): @auto_docstring class EsmPreTrainedModel(PreTrainedModel): - config_class = EsmConfig + config: EsmConfig base_model_prefix = "esm" supports_gradient_checkpointing = True _no_split_modules = ["EsmLayer", "EsmFoldTriangularSelfAttentionBlock", "EsmEmbeddings"] diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 8e03e28c0d..392472d537 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -636,7 +636,7 @@ class FalconDecoderLayer(GradientCheckpointingLayer): @auto_docstring class FalconPreTrainedModel(PreTrainedModel): - config_class = FalconConfig + config: FalconConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True _no_split_modules = ["FalconDecoderLayer"] diff --git a/src/transformers/models/falcon_h1/modeling_falcon_h1.py b/src/transformers/models/falcon_h1/modeling_falcon_h1.py index a4ab2fe8d1..3c6a8e500e 100644 --- a/src/transformers/models/falcon_h1/modeling_falcon_h1.py +++ 
b/src/transformers/models/falcon_h1/modeling_falcon_h1.py @@ -1145,7 +1145,7 @@ class FalconH1DecoderLayer(GradientCheckpointingLayer): @auto_docstring class FalconH1PreTrainedModel(PreTrainedModel): - config_class = FalconH1Config + config: FalconH1Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["FalconH1DecoderLayer"] diff --git a/src/transformers/models/falcon_h1/modular_falcon_h1.py b/src/transformers/models/falcon_h1/modular_falcon_h1.py index bd4f63375b..305fd7bfbb 100644 --- a/src/transformers/models/falcon_h1/modular_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modular_falcon_h1.py @@ -923,7 +923,7 @@ class FalconH1DecoderLayer(GradientCheckpointingLayer): @auto_docstring class FalconH1PreTrainedModel(PreTrainedModel): - config_class = FalconH1Config + config: FalconH1Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["FalconH1DecoderLayer"] diff --git a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 942053be3e..c32157ab11 100644 --- a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -437,7 +437,7 @@ class FalconMambaBlock(GradientCheckpointingLayer): @auto_docstring # Copied from transformers.models.mamba.modeling_mamba.MambaPreTrainedModel with Mamba->FalconMamba class FalconMambaPreTrainedModel(PreTrainedModel): - config_class = FalconMambaConfig + config: FalconMambaConfig base_model_prefix = "backbone" _no_split_modules = ["FalconMambaBlock", "FalconMambaMixer"] supports_gradient_checkpointing = True diff --git a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py index f19ea88c17..08039ef2d6 100644 --- 
a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py @@ -959,7 +959,7 @@ class FastSpeech2ConformerLoss(nn.Module): @auto_docstring class FastSpeech2ConformerPreTrainedModel(PreTrainedModel): - config_class = FastSpeech2ConformerConfig + config: FastSpeech2ConformerConfig base_model_prefix = "fastspeech2_conformer" main_input_name = "input_ids" @@ -1331,7 +1331,7 @@ class HifiGanResidualBlock(nn.Module): ) # Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5HifiGan with SpeechT5->FastSpeech2Conformer class FastSpeech2ConformerHifiGan(PreTrainedModel): - config_class = FastSpeech2ConformerHifiGanConfig + config: FastSpeech2ConformerHifiGanConfig main_input_name = "spectrogram" def __init__(self, config: FastSpeech2ConformerHifiGanConfig): @@ -1455,7 +1455,7 @@ class FastSpeech2ConformerHifiGan(PreTrainedModel): """ ) class FastSpeech2ConformerWithHifiGan(PreTrainedModel): - config_class = FastSpeech2ConformerWithHifiGanConfig + config: FastSpeech2ConformerWithHifiGanConfig def __init__(self, config: FastSpeech2ConformerWithHifiGanConfig): super().__init__(config) diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index 7699d3f31f..3643119988 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -678,7 +678,7 @@ class FlaubertSequenceSummary(nn.Module): @auto_docstring # Copied from transformers.models.xlm.modeling_xlm.XLMPreTrainedModel with XLM->Flaubert class FlaubertPreTrainedModel(PreTrainedModel): - config_class = FlaubertConfig + config: FlaubertConfig load_tf_weights = None base_model_prefix = "transformer" diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 64a61e66b5..63eadf41c3 100644 --- 
a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -694,7 +694,7 @@ class FlavaPooler(nn.Module): @auto_docstring class FlavaPreTrainedModel(PreTrainedModel): - config_class = FlavaConfig + config: FlavaConfig base_model_prefix = "flava" supports_gradient_checkpointing = True @@ -729,7 +729,7 @@ class FlavaPreTrainedModel(PreTrainedModel): @auto_docstring class FlavaImageModel(FlavaPreTrainedModel): - config_class = FlavaImageConfig + config: FlavaImageConfig # This override allows us to load FlavaImageModel from FlavaModel/FlavaForPreTraining checkpoints. base_model_prefix = "flava.image_model" main_input_name = "pixel_values" @@ -826,7 +826,7 @@ class FlavaImageModel(FlavaPreTrainedModel): @auto_docstring class FlavaTextModel(FlavaPreTrainedModel): - config_class = FlavaTextConfig + config: FlavaTextConfig # This override allows us to load FlavaTextModel from FlavaModel/FlavaForPreTraining checkpoints. base_model_prefix = "flava.text_model" @@ -939,7 +939,7 @@ class FlavaTextModel(FlavaPreTrainedModel): @auto_docstring class FlavaMultimodalModel(FlavaPreTrainedModel): - config_class = FlavaMultimodalConfig + config: FlavaMultimodalConfig # This override allows us to load FlavaMultimodalModel from FlavaModel/FlavaForPreTraining checkpoints. 
base_model_prefix = "flava.multimodal_model" main_input_name = "hidden_states" @@ -1035,7 +1035,7 @@ class FlavaMultimodalModel(FlavaPreTrainedModel): @auto_docstring class FlavaModel(FlavaPreTrainedModel): - config_class = FlavaConfig + config: FlavaConfig def __init__(self, config: FlavaConfig): super().__init__(config) @@ -1400,7 +1400,7 @@ class FlavaImageCodebookLayerGroup(nn.Module): ) class FlavaImageCodebook(FlavaPreTrainedModel): base_model_prefix = "" - config_class = FlavaImageCodebookConfig + config: FlavaImageCodebookConfig main_input_name = "pixel_values" supports_gradient_checkpointing = False diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index e35031ebc3..1cb0e764b2 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -386,7 +386,7 @@ class FNetPreTrainingHeads(nn.Module): @auto_docstring class FNetPreTrainedModel(PreTrainedModel): - config_class = FNetConfig + config: FNetConfig base_model_prefix = "fnet" supports_gradient_checkpointing = True diff --git a/src/transformers/models/focalnet/modeling_focalnet.py b/src/transformers/models/focalnet/modeling_focalnet.py index 091424fb41..93bdae6096 100644 --- a/src/transformers/models/focalnet/modeling_focalnet.py +++ b/src/transformers/models/focalnet/modeling_focalnet.py @@ -582,7 +582,7 @@ class FocalNetEncoder(nn.Module): @auto_docstring class FocalNetPreTrainedModel(PreTrainedModel): - config_class = FocalNetConfig + config: FocalNetConfig base_model_prefix = "focalnet" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index 52f3d027c8..66fdca387a 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -217,7 +217,7 @@ def _prepare_fsmt_decoder_inputs( @auto_docstring class 
PretrainedFSMTModel(PreTrainedModel): - config_class = FSMTConfig + config: FSMTConfig base_model_prefix = "model" def _init_weights(self, module): diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py index 9f17bf08a0..4370344ccc 100644 --- a/src/transformers/models/funnel/modeling_funnel.py +++ b/src/transformers/models/funnel/modeling_funnel.py @@ -760,7 +760,7 @@ class FunnelDiscriminatorPredictions(nn.Module): @auto_docstring class FunnelPreTrainedModel(PreTrainedModel): - config_class = FunnelConfig + config: FunnelConfig load_tf_weights = load_tf_weights_in_funnel base_model_prefix = "funnel" diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index 57e256c8fa..b7a4cb9a05 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -34,7 +34,7 @@ logger = logging.get_logger(__name__) @auto_docstring class FuyuPreTrainedModel(PreTrainedModel): - config_class = FuyuConfig + config: FuyuConfig base_model_prefix = "fuyu" supports_gradient_checkpointing = True _supports_attention_backend = True diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 2a5c08f1b1..2aab29381a 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -305,7 +305,7 @@ class GemmaDecoderLayer(GradientCheckpointingLayer): @auto_docstring class GemmaPreTrainedModel(PreTrainedModel): - config_class = GemmaConfig + config: GemmaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["GemmaDecoderLayer"] diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 3a45c3275a..df45009a92 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ 
-335,7 +335,7 @@ class Gemma2RotaryEmbedding(nn.Module): @auto_docstring class Gemma2PreTrainedModel(PreTrainedModel): - config_class = Gemma2Config + config: Gemma2Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Gemma2DecoderLayer"] diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index 5f939c8248..2aa3f99427 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -420,7 +420,7 @@ class Gemma3DecoderLayer(GradientCheckpointingLayer): @auto_docstring class Gemma3PreTrainedModel(PreTrainedModel): - config_class = Gemma3Config + config: Gemma3Config base_model_prefix = "" supports_gradient_checkpointing = True _no_split_modules = [ @@ -460,7 +460,7 @@ class Gemma3PreTrainedModel(PreTrainedModel): @auto_docstring class Gemma3TextModel(Gemma3PreTrainedModel): - config_class = Gemma3TextConfig + config: Gemma3TextConfig def __init__(self, config: Gemma3TextConfig): super().__init__(config) @@ -609,7 +609,7 @@ class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] _tp_plan = {"lm_head": "colwise_rep"} _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} - config_class = Gemma3TextConfig + config: Gemma3TextConfig base_model_prefix = "language_model" def __init__(self, config: Gemma3TextConfig): diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 85715626cc..62f0934bb8 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -546,7 +546,7 @@ class Gemma3PreTrainedModel(Gemma2PreTrainedModel): class Gemma3TextModel(Gemma2Model): - config_class = Gemma3TextConfig + config: Gemma3TextConfig def __init__(self, config: Gemma3TextConfig): super().__init__(config) @@ -672,7 +672,7 @@ class Gemma3TextModel(Gemma2Model): 
class Gemma3ForCausalLM(Gemma2ForCausalLM): - config_class = Gemma3TextConfig + config: Gemma3TextConfig base_model_prefix = "language_model" def __init__(self, config: Gemma3TextConfig): diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index c63f9b31c0..8b9b516d92 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -914,7 +914,7 @@ class Gemma3nAudioConformerBlock(nn.Module): class Gemma3nAudioEncoder(PreTrainedModel): """An audio encoder based on the [Universal Speech Model](https://arxiv.org/abs/2303.01037) architecture.""" - config_class = Gemma3nAudioConfig + config: Gemma3nAudioConfig main_input_name = "audio_mel" @@ -1481,7 +1481,7 @@ class Gemma3nTextDecoderLayer(GradientCheckpointingLayer): @auto_docstring class Gemma3nPreTrainedModel(PreTrainedModel): - config_class = Gemma3nConfig + config: Gemma3nConfig base_model_prefix = "" supports_gradient_checkpointing = True _no_split_modules = ["Gemma3nTextDecoderLayer"] @@ -1523,7 +1523,7 @@ class Gemma3nPreTrainedModel(PreTrainedModel): @auto_docstring(custom_intro="The base Gemma 3n language model without a language modeling head.") class Gemma3nTextModel(Gemma3nPreTrainedModel): - config_class = Gemma3nTextConfig + config: Gemma3nTextConfig def __init__(self, config: Gemma3nTextConfig): super().__init__(config) @@ -1780,7 +1780,7 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] _tp_plan = {"lm_head": "colwise_rep"} _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} - config_class = Gemma3nTextConfig + config: Gemma3nTextConfig base_model_prefix = "model" _checkpoint_conversion_mapping = {"model.language_model": "model"} diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 84c5fcfb43..8e3bcfd1f1 100644 --- 
a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -1475,7 +1475,7 @@ class Gemma3nAudioConformerBlock(nn.Module): class Gemma3nAudioEncoder(PreTrainedModel): """An audio encoder based on the [Universal Speech Model](https://arxiv.org/abs/2303.01037) architecture.""" - config_class = Gemma3nAudioConfig + config: Gemma3nAudioConfig main_input_name = "audio_mel" @@ -1912,7 +1912,7 @@ class Gemma3nTextDecoderLayer(Gemma3DecoderLayer): class Gemma3nPreTrainedModel(Gemma2PreTrainedModel): - config_class = Gemma3nConfig + config: Gemma3nConfig base_model_prefix = "" _no_split_modules = ["Gemma3nTextDecoderLayer"] @@ -1942,7 +1942,7 @@ class Gemma3nPreTrainedModel(Gemma2PreTrainedModel): @auto_docstring(custom_intro="The base Gemma 3n language model without a language modeling head.") class Gemma3nTextModel(Gemma3TextModel): - config_class = Gemma3nTextConfig + config: Gemma3nTextConfig def __init__(self, config: Gemma3nTextConfig): super().__init__(config) diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index c6b1aa1f88..3312e80c79 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -453,7 +453,7 @@ class GitEncoder(nn.Module): @auto_docstring class GitPreTrainedModel(PreTrainedModel): - config_class = GitConfig + config: GitConfig base_model_prefix = "git" supports_gradient_checkpointing = True @@ -879,7 +879,7 @@ class GitVisionTransformer(nn.Module): """ ) class GitVisionModel(GitPreTrainedModel): - config_class = GitVisionConfig + config: GitVisionConfig main_input_name = "pixel_values" # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP->Git diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 147ccde41a..d6cf8e137f 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ 
b/src/transformers/models/glm/modeling_glm.py @@ -322,7 +322,7 @@ class GlmDecoderLayer(GradientCheckpointingLayer): @auto_docstring class GlmPreTrainedModel(PreTrainedModel): - config_class = GlmConfig + config: GlmConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["GlmDecoderLayer"] diff --git a/src/transformers/models/glm4/modeling_glm4.py b/src/transformers/models/glm4/modeling_glm4.py index b1c6421fe1..c3c9a0ab1f 100644 --- a/src/transformers/models/glm4/modeling_glm4.py +++ b/src/transformers/models/glm4/modeling_glm4.py @@ -326,7 +326,7 @@ class Glm4RotaryEmbedding(nn.Module): @auto_docstring class Glm4PreTrainedModel(PreTrainedModel): - config_class = Glm4Config + config: Glm4Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Glm4DecoderLayer"] diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index 2e8a4149d7..90cddf636f 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ b/src/transformers/models/glm4v/modeling_glm4v.py @@ -399,7 +399,7 @@ class Glm4vVisionBlock(GradientCheckpointingLayer): @auto_docstring class Glm4vPreTrainedModel(PreTrainedModel): - config_class = Glm4vConfig + config: Glm4vConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"] @@ -428,7 +428,7 @@ class Glm4vPreTrainedModel(PreTrainedModel): class Glm4vVisionModel(Glm4vPreTrainedModel): - config_class = Glm4vVisionConfig + config: Glm4vVisionConfig _no_split_modules = ["Glm4vVisionBlock"] def __init__(self, config) -> None: @@ -819,7 +819,7 @@ class Glm4vModelOutputWithPast(ModelOutput): @auto_docstring class Glm4vTextModel(Glm4vPreTrainedModel): - config_class = Glm4vTextConfig + config: Glm4vTextConfig def __init__(self, config: Glm4vTextConfig): super().__init__(config) @@ -950,7 +950,7 @@ class Glm4vTextModel(Glm4vPreTrainedModel): class 
Glm4vModel(Glm4vPreTrainedModel): base_model_prefix = "" _checkpoint_conversion_mapping = {} - config_class = Glm4vConfig + config: Glm4vConfig _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"] def __init__(self, config): diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 945c9a7e7f..5db53e8b30 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -543,7 +543,7 @@ class Glm4vPreTrainedModel(Qwen2_5_VLPreTrainedModel): class Glm4vVisionModel(Glm4vPreTrainedModel): - config_class = Glm4vVisionConfig + config: Glm4vVisionConfig _no_split_modules = ["Glm4vVisionBlock"] def __init__(self, config) -> None: diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index c71804fc11..50408caa3f 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -39,13 +39,13 @@ class Glm4vImagesKwargs(ImagesKwargs): class Glm4vProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Glm4vImagesKwargs + videos_kwargs: Glm4vVideosProcessorKwargs _defaults = { "text_kwargs": { "padding": False, }, } - images_kwargs: Glm4vImagesKwargs - videos_kwargs: Glm4vVideosProcessorKwargs class Glm4vProcessor(ProcessorMixin): diff --git a/src/transformers/models/glpn/modeling_glpn.py b/src/transformers/models/glpn/modeling_glpn.py index b21d2f14d7..65e7b9b265 100755 --- a/src/transformers/models/glpn/modeling_glpn.py +++ b/src/transformers/models/glpn/modeling_glpn.py @@ -409,7 +409,7 @@ class GLPNEncoder(nn.Module): @auto_docstring class GLPNPreTrainedModel(PreTrainedModel): - config_class = GLPNConfig + config: GLPNConfig base_model_prefix = "glpn" main_input_name = "pixel_values" _no_split_modules = [] diff --git a/src/transformers/models/got_ocr2/modeling_got_ocr2.py b/src/transformers/models/got_ocr2/modeling_got_ocr2.py index 
f11f12cd54..4ff79a53de 100644 --- a/src/transformers/models/got_ocr2/modeling_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modeling_got_ocr2.py @@ -276,7 +276,7 @@ class GotOcr2VisionLayer(GradientCheckpointingLayer): @auto_docstring class GotOcr2PreTrainedModel(PreTrainedModel): - config_class = GotOcr2Config + config: GotOcr2Config base_model_prefix = "" supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 78cc233809..7039169519 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -552,7 +552,7 @@ class GPT2SequenceSummary(nn.Module): @auto_docstring class GPT2PreTrainedModel(PreTrainedModel): - config_class = GPT2Config + config: GPT2Config load_tf_weights = load_tf_weights_in_gpt2 base_model_prefix = "transformer" is_parallelizable = True diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index aae9a3e9b0..89c280f93a 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -359,7 +359,7 @@ class GPTBigCodeBlock(nn.Module): @auto_docstring class GPTBigCodePreTrainedModel(PreTrainedModel): - config_class = GPTBigCodeConfig + config: GPTBigCodeConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True _no_split_modules = ["GPTBigCodeBlock"] diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index e6df1a4225..cce119303a 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -470,7 +470,7 @@ class GPTNeoBlock(GradientCheckpointingLayer): @auto_docstring class GPTNeoPreTrainedModel(PreTrainedModel): - config_class = GPTNeoConfig + 
config: GPTNeoConfig load_tf_weights = load_tf_weights_in_gpt_neo base_model_prefix = "transformer" supports_gradient_checkpointing = True diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 511ac1a29c..b868d4353c 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -355,7 +355,7 @@ class GPTNeoXDecoderLayer(GradientCheckpointingLayer): @auto_docstring class GPTNeoXPreTrainedModel(PreTrainedModel): - config_class = GPTNeoXConfig + config: GPTNeoXConfig base_model_prefix = "gpt_neox" supports_gradient_checkpointing = True _no_split_modules = ["GPTNeoXLayer"] diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index 6f8674b00b..9e1859e794 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -43,7 +43,7 @@ logger = logging.get_logger(__name__) @auto_docstring class GPTNeoXJapanesePreTrainedModel(PreTrainedModel): - config_class = GPTNeoXJapaneseConfig + config: GPTNeoXJapaneseConfig base_model_prefix = "gpt_neox_japanese" _no_split_modules = ["GPTNeoXJapaneseLayer"] _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 7fcc7451ac..093dbf355d 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -465,7 +465,7 @@ class GPTJBlock(GradientCheckpointingLayer): @auto_docstring class GPTJPreTrainedModel(PreTrainedModel): - config_class = GPTJConfig + config: GPTJConfig base_model_prefix = "transformer" is_parallelizable = True supports_gradient_checkpointing = True diff --git a/src/transformers/models/granite/modeling_granite.py 
b/src/transformers/models/granite/modeling_granite.py index 87e2dfd793..13804b9c26 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -300,7 +300,7 @@ class GraniteDecoderLayer(GradientCheckpointingLayer): @auto_docstring class GranitePreTrainedModel(PreTrainedModel): - config_class = GraniteConfig + config: GraniteConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["GraniteDecoderLayer"] diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 2cfaa23582..18e16d3605 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -281,7 +281,7 @@ class GraniteSpeechCTCEncoder(nn.Module): @auto_docstring class GraniteSpeechPreTrainedModel(PreTrainedModel): - config_class = GraniteSpeechConfig + config: GraniteSpeechConfig _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index 132c243493..caa9214183 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -584,7 +584,7 @@ class GraniteMoeDecoderLayer(GradientCheckpointingLayer): @auto_docstring class GraniteMoePreTrainedModel(PreTrainedModel): - config_class = GraniteMoeConfig + config: GraniteMoeConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["GraniteMoeDecoderLayer"] diff --git a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py index 761bee178f..1ba85ba730 100644 --- a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +++ 
b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py @@ -1157,7 +1157,7 @@ class GraniteMoeHybridDecoderLayer(GradientCheckpointingLayer): @auto_docstring class GraniteMoeHybridPreTrainedModel(PreTrainedModel): - config_class = GraniteMoeHybridConfig + config: GraniteMoeHybridConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["GraniteMoeHybridDecoderLayer"] diff --git a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py index eea61219ac..80c5cfd430 100644 --- a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py @@ -162,7 +162,7 @@ class GraniteMoeHybridDecoderLayer(GraniteMoeSharedDecoderLayer): class GraniteMoeHybridPreTrainedModel(GraniteMoeSharedPreTrainedModel): - config_class = GraniteMoeHybridConfig + config: GraniteMoeHybridConfig _no_split_modules = ["GraniteMoeHybridDecoderLayer"] _is_stateful = True diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py index 009ecbf0dd..7bd81f2db3 100644 --- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py @@ -502,7 +502,7 @@ class GraniteMoeSharedDecoderLayer(GradientCheckpointingLayer): @auto_docstring class GraniteMoeSharedPreTrainedModel(PreTrainedModel): - config_class = GraniteMoeSharedConfig + config: GraniteMoeSharedConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["GraniteMoeSharedDecoderLayer"] diff --git a/src/transformers/models/granitemoeshared/modular_granitemoeshared.py b/src/transformers/models/granitemoeshared/modular_granitemoeshared.py index ee8fc48d38..29342cb625 100644 --- 
a/src/transformers/models/granitemoeshared/modular_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modular_granitemoeshared.py @@ -147,7 +147,7 @@ class GraniteMoeSharedDecoderLayer(GraniteMoeDecoderLayer): class GraniteMoeSharedPreTrainedModel(GraniteMoePreTrainedModel): - config_class = GraniteMoeSharedConfig + config: GraniteMoeSharedConfig _no_split_modules = ["GraniteMoeSharedDecoderLayer"] diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 197c99c57b..8432e510ed 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1373,7 +1373,7 @@ class GroundingDinoContrastiveEmbedding(nn.Module): @auto_docstring class GroundingDinoPreTrainedModel(PreTrainedModel): - config_class = GroundingDinoConfig + config: GroundingDinoConfig base_model_prefix = "model" main_input_name = "pixel_values" diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index a33e455753..c9673a128f 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -744,7 +744,7 @@ class GroupViTEncoderLayer(GradientCheckpointingLayer): @auto_docstring class GroupViTPreTrainedModel(PreTrainedModel): - config_class = GroupViTConfig + config: GroupViTConfig base_model_prefix = "groupvit" supports_gradient_checkpointing = True @@ -1021,7 +1021,7 @@ class GroupViTTextTransformer(nn.Module): class GroupViTTextModel(GroupViTPreTrainedModel): - config_class = GroupViTTextConfig + config: GroupViTTextConfig def __init__(self, config: GroupViTTextConfig): super().__init__(config) @@ -1124,7 +1124,7 @@ class GroupViTVisionTransformer(nn.Module): class GroupViTVisionModel(GroupViTPreTrainedModel): - config_class = GroupViTVisionConfig + config: 
GroupViTVisionConfig main_input_name = "pixel_values" def __init__(self, config: GroupViTVisionConfig): @@ -1174,7 +1174,7 @@ class GroupViTVisionModel(GroupViTPreTrainedModel): @auto_docstring class GroupViTModel(GroupViTPreTrainedModel): - config_class = GroupViTConfig + config: GroupViTConfig def __init__(self, config: GroupViTConfig): super().__init__(config) diff --git a/src/transformers/models/helium/modeling_helium.py b/src/transformers/models/helium/modeling_helium.py index 7140b89b44..f68f810dda 100644 --- a/src/transformers/models/helium/modeling_helium.py +++ b/src/transformers/models/helium/modeling_helium.py @@ -307,7 +307,7 @@ class HeliumDecoderLayer(GradientCheckpointingLayer): @auto_docstring class HeliumPreTrainedModel(PreTrainedModel): - config_class = HeliumConfig + config: HeliumConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["HeliumDecoderLayer"] diff --git a/src/transformers/models/hgnet_v2/modeling_hgnet_v2.py b/src/transformers/models/hgnet_v2/modeling_hgnet_v2.py index 4411a50488..e9620ade40 100644 --- a/src/transformers/models/hgnet_v2/modeling_hgnet_v2.py +++ b/src/transformers/models/hgnet_v2/modeling_hgnet_v2.py @@ -40,7 +40,7 @@ from .configuration_hgnet_v2 import HGNetV2Config @auto_docstring class HGNetV2PreTrainedModel(PreTrainedModel): - config_class = HGNetV2Config + config: HGNetV2Config base_model_prefix = "hgnetv2" main_input_name = "pixel_values" _no_split_modules = ["HGNetV2BasicLayer"] diff --git a/src/transformers/models/hgnet_v2/modular_hgnet_v2.py b/src/transformers/models/hgnet_v2/modular_hgnet_v2.py index dca1f87594..f5b8735f46 100644 --- a/src/transformers/models/hgnet_v2/modular_hgnet_v2.py +++ b/src/transformers/models/hgnet_v2/modular_hgnet_v2.py @@ -165,7 +165,7 @@ class HGNetV2Config(BackboneConfigMixin, PretrainedConfig): @auto_docstring class HGNetV2PreTrainedModel(PreTrainedModel): - config_class = HGNetV2Config + config: HGNetV2Config base_model_prefix = 
"hgnetv2" main_input_name = "pixel_values" _no_split_modules = ["HGNetV2BasicLayer"] diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 2efa7c2f37..2fcd827e89 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -791,7 +791,7 @@ def unroll( @auto_docstring class HieraPreTrainedModel(PreTrainedModel): - config_class = HieraConfig + config: HieraConfig base_model_prefix = "hiera" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index c4ab2cb2ec..63df33beeb 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -679,7 +679,7 @@ class HubertEncoderStableLayerNorm(nn.Module): @auto_docstring class HubertPreTrainedModel(PreTrainedModel): - config_class = HubertConfig + config: HubertConfig base_model_prefix = "hubert" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/hubert/modular_hubert.py b/src/transformers/models/hubert/modular_hubert.py index 3e12c14e4c..facebcf445 100644 --- a/src/transformers/models/hubert/modular_hubert.py +++ b/src/transformers/models/hubert/modular_hubert.py @@ -125,7 +125,7 @@ class HubertEncoderStableLayerNorm(Wav2Vec2EncoderStableLayerNorm): @auto_docstring class HubertPreTrainedModel(PreTrainedModel): - config_class = HubertConfig + config: HubertConfig base_model_prefix = "hubert" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py index 5d9c9b17e4..6b960148ca 100644 --- a/src/transformers/models/ibert/modeling_ibert.py +++ b/src/transformers/models/ibert/modeling_ibert.py @@ -623,7 +623,7 @@ class IBertPooler(nn.Module): 
@auto_docstring class IBertPreTrainedModel(PreTrainedModel): - config_class = IBertConfig + config: IBertConfig base_model_prefix = "ibert" def _init_weights(self, module): diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 9812204e91..9ac09ca6d0 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -873,7 +873,7 @@ class IdeficsGatedCrossAttentionLayer(GradientCheckpointingLayer): @auto_docstring class IdeficsPreTrainedModel(PreTrainedModel): - config_class = IdeficsConfig + config: IdeficsConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"] diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 6c93643a3f..60022d781a 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -451,7 +451,7 @@ class Idefics2Encoder(nn.Module): @auto_docstring class Idefics2PreTrainedModel(PreTrainedModel): - config_class = Idefics2Config + config: Idefics2Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Idefics2VisionAttention", "Idefics2MLP", "Idefics2PerceiverLayer", "Idefics2DecoderLayer"] @@ -492,7 +492,7 @@ class Idefics2PreTrainedModel(PreTrainedModel): """ ) class Idefics2VisionTransformer(Idefics2PreTrainedModel): - config_class = Idefics2VisionConfig + config: Idefics2VisionConfig _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True @@ -779,7 +779,7 @@ class Idefics2PerceiverLayer(nn.Module): """ ) class Idefics2PerceiverResampler(Idefics2PreTrainedModel): - config_class = Idefics2PerceiverConfig + config: Idefics2PerceiverConfig _supports_sdpa = True _supports_flash_attention_2 = True _supports_flex_attn = True diff 
--git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index 021e0d9e70..bb0cd4f70f 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -468,7 +468,7 @@ class Idefics3Connector(nn.Module): @auto_docstring class Idefics3PreTrainedModel(PreTrainedModel): - config_class = Idefics3Config + config: Idefics3Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Idefics3VisionAttention", "Idefics3DecoderLayer"] @@ -503,7 +503,7 @@ class Idefics3PreTrainedModel(PreTrainedModel): """ ) class Idefics3VisionTransformer(Idefics3PreTrainedModel): - config_class = Idefics3VisionConfig + config: Idefics3VisionConfig _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True diff --git a/src/transformers/models/ijepa/modeling_ijepa.py b/src/transformers/models/ijepa/modeling_ijepa.py index a0e89f406f..51d09184a1 100644 --- a/src/transformers/models/ijepa/modeling_ijepa.py +++ b/src/transformers/models/ijepa/modeling_ijepa.py @@ -145,7 +145,7 @@ class IJepaEmbeddings(nn.Module): @auto_docstring class IJepaPreTrainedModel(PreTrainedModel): - config_class = IJepaConfig + config: IJepaConfig base_model_prefix = "ijepa" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/ijepa/modular_ijepa.py b/src/transformers/models/ijepa/modular_ijepa.py index 231753cea5..4749c3f598 100644 --- a/src/transformers/models/ijepa/modular_ijepa.py +++ b/src/transformers/models/ijepa/modular_ijepa.py @@ -88,7 +88,7 @@ class IJepaEmbeddings(ViTEmbeddings): @auto_docstring class IJepaPreTrainedModel(PreTrainedModel): - config_class = IJepaConfig + config: IJepaConfig base_model_prefix = "ijepa" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py 
b/src/transformers/models/imagegpt/modeling_imagegpt.py index 2041c615df..cbac5c2792 100755 --- a/src/transformers/models/imagegpt/modeling_imagegpt.py +++ b/src/transformers/models/imagegpt/modeling_imagegpt.py @@ -498,7 +498,7 @@ class ImageGPTBlock(GradientCheckpointingLayer): @auto_docstring class ImageGPTPreTrainedModel(PreTrainedModel): - config_class = ImageGPTConfig + config: ImageGPTConfig load_tf_weights = load_tf_weights_in_imagegpt base_model_prefix = "transformer" main_input_name = "input_ids" diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py index abd55a22bf..d914b25531 100644 --- a/src/transformers/models/informer/modeling_informer.py +++ b/src/transformers/models/informer/modeling_informer.py @@ -251,7 +251,7 @@ class InformerValueEmbedding(nn.Module): @auto_docstring class InformerPreTrainedModel(PreTrainedModel): - config_class = InformerConfig + config: InformerConfig base_model_prefix = "model" main_input_name = "past_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/informer/modular_informer.py b/src/transformers/models/informer/modular_informer.py index 79d7c66114..4e306b1753 100644 --- a/src/transformers/models/informer/modular_informer.py +++ b/src/transformers/models/informer/modular_informer.py @@ -92,7 +92,7 @@ class InformerValueEmbedding(TimeSeriesValueEmbedding): @auto_docstring class InformerPreTrainedModel(PreTrainedModel): - config_class = InformerConfig + config: InformerConfig base_model_prefix = "model" main_input_name = "past_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index c5af37a0d9..b88c003660 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -332,7 +332,7 @@ class 
InstructBlipEncoderLayer(GradientCheckpointingLayer): @auto_docstring class InstructBlipPreTrainedModel(PreTrainedModel): - config_class = InstructBlipConfig + config: InstructBlipConfig base_model_prefix = "blip" supports_gradient_checkpointing = True _supports_attention_backend = True @@ -452,7 +452,7 @@ class InstructBlipEncoder(nn.Module): # Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->InstructBlip, BLIP->INSTRUCTBLIP class InstructBlipVisionModel(InstructBlipPreTrainedModel): main_input_name = "pixel_values" - config_class = InstructBlipVisionConfig + config: InstructBlipVisionConfig def __init__(self, config: InstructBlipVisionConfig): super().__init__(config) @@ -1351,7 +1351,7 @@ class InstructBlipModel(InstructBlipPreTrainedModel): """ ) class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, GenerationMixin): - config_class = InstructBlipConfig + config: InstructBlipConfig main_input_name = "pixel_values" _supports_static_cache = True diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index 0f62721d67..cec9198253 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -819,7 +819,7 @@ class InstructBlipVideoQFormerEmbeddings(nn.Module): @auto_docstring class InstructBlipVideoPreTrainedModel(PreTrainedModel): - config_class = InstructBlipVideoConfig + config: InstructBlipVideoConfig base_model_prefix = "blip" supports_gradient_checkpointing = True _supports_attention_backend = True @@ -858,7 +858,7 @@ class InstructBlipVideoPreTrainedModel(PreTrainedModel): class InstructBlipVideoVisionModel(InstructBlipVideoPreTrainedModel): main_input_name = "pixel_values" - config_class = InstructBlipVideoVisionConfig + config: InstructBlipVideoVisionConfig def __init__(self, config: 
InstructBlipVideoVisionConfig): super().__init__(config) @@ -1357,7 +1357,7 @@ class InstructBlipVideoModel(InstructBlipVideoPreTrainedModel): """ ) class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel, GenerationMixin): - config_class = InstructBlipVideoConfig + config: InstructBlipVideoConfig main_input_name = "pixel_values" _supports_static_cache = True diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py index d5768ef4a8..983d16ef03 100644 --- a/src/transformers/models/internvl/modeling_internvl.py +++ b/src/transformers/models/internvl/modeling_internvl.py @@ -173,7 +173,7 @@ class InternVLVisionAttention(nn.Module): @auto_docstring class InternVLVisionPreTrainedModel(PreTrainedModel): - config_class = InternVLVisionConfig + config: InternVLVisionConfig base_model_prefix = "internvl_vision" main_input_name = "pixel_values" supports_gradient_checkpointing = True @@ -516,7 +516,7 @@ class InternVLVisionModel(InternVLVisionPreTrainedModel): @auto_docstring class InternVLPreTrainedModel(PreTrainedModel): - config_class = InternVLConfig + config: InternVLConfig base_model_prefix = "" supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/internvl/modular_internvl.py b/src/transformers/models/internvl/modular_internvl.py index 0e52b89872..45d8c3be3e 100644 --- a/src/transformers/models/internvl/modular_internvl.py +++ b/src/transformers/models/internvl/modular_internvl.py @@ -135,7 +135,7 @@ class InternVLVisionAttention(JanusVisionAttention): @auto_docstring class InternVLVisionPreTrainedModel(PreTrainedModel): - config_class = InternVLVisionConfig + config: InternVLVisionConfig base_model_prefix = "internvl_vision" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 
c7145cd825..52c5d7828a 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -1064,7 +1064,7 @@ class JambaMambaDecoderLayer(GradientCheckpointingLayer): @auto_docstring class JambaPreTrainedModel(PreTrainedModel): - config_class = JambaConfig + config: JambaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["JambaAttentionDecoderLayer", "JambaMambaDecoderLayer"] diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py index e85023a208..3cd578bc80 100644 --- a/src/transformers/models/janus/modeling_janus.py +++ b/src/transformers/models/janus/modeling_janus.py @@ -55,7 +55,7 @@ logger = logging.get_logger(__name__) @auto_docstring class JanusPreTrainedModel(PreTrainedModel): - config_class = JanusConfig + config: JanusConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer", "JanusVisionEncoderLayer"] @@ -513,7 +513,7 @@ class JanusVisionEncoder(nn.Module): @auto_docstring class JanusVisionModel(JanusPreTrainedModel): main_input_name = "pixel_values" - config_class = JanusVisionConfig + config: JanusVisionConfig def __init__(self, config: JanusVisionConfig): super().__init__(config) @@ -932,7 +932,7 @@ class JanusVQVAEDecoder(nn.Module): """ ) class JanusVQVAE(JanusPreTrainedModel): - config_class = JanusVQVAEConfig + config: JanusVQVAEConfig _no_split_modules = [ "JanusVQVAEAttnBlock", "JanusVQVAEResnetBlock", diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index 588500bae2..313b431329 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -382,7 +382,7 @@ class JanusConfig(PretrainedConfig): @auto_docstring class JanusPreTrainedModel(PreTrainedModel): - config_class = JanusConfig + config: JanusConfig base_model_prefix = 
"model" supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer", "JanusVisionEncoderLayer"] diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index c13437c5bb..224d35f97b 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -821,7 +821,7 @@ class JetMoeBlock(GradientCheckpointingLayer): @auto_docstring class JetMoePreTrainedModel(PreTrainedModel): - config_class = JetMoeConfig + config: JetMoeConfig base_model_prefix = "transformer" supports_gradient_checkpointing = False _no_split_modules = ["JetMoeBlock"] diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index e3cbd12f73..2082d68f29 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -1149,7 +1149,7 @@ class Kosmos2TextTransformer(nn.Module): @auto_docstring class Kosmos2PreTrainedModel(PreTrainedModel): - config_class = Kosmos2Config + config: Kosmos2Config supports_gradient_checkpointing = True _no_split_modules = ["Kosmos2VisionEncoderLayer", "Kosmos2TextBlock"] _supports_attention_backend = True @@ -1241,7 +1241,7 @@ class Kosmos2PreTrainedModel(PreTrainedModel): class Kosmos2VisionModel(Kosmos2PreTrainedModel): - config_class = Kosmos2VisionConfig + config: Kosmos2VisionConfig main_input_name = "pixel_values" # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP_VISION->KOSMOS2_VISION,CLIP->Kosmos2,self.vision_model->self.model @@ -1274,7 +1274,7 @@ class Kosmos2VisionModel(Kosmos2PreTrainedModel): class Kosmos2TextModel(Kosmos2PreTrainedModel): - config_class = Kosmos2TextConfig + config: Kosmos2TextConfig def __init__(self, config: Kosmos2TextConfig): super().__init__(config) @@ -1353,7 +1353,7 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel): """ ) class 
Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin): - config_class = Kosmos2TextConfig + config: Kosmos2TextConfig _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: Kosmos2TextConfig): @@ -1549,7 +1549,7 @@ class Kosmos2ImageToTextProjection(nn.Module): """ ) class Kosmos2Model(Kosmos2PreTrainedModel): - config_class = Kosmos2Config + config: Kosmos2Config main_input_name = "pixel_values" def __init__(self, config: Kosmos2Config): @@ -1708,7 +1708,7 @@ class Kosmos2Model(Kosmos2PreTrainedModel): """ ) class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin): - config_class = Kosmos2Config + config: Kosmos2Config main_input_name = "pixel_values" _tied_weights_keys = ["text_model.lm_head.weight"] diff --git a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py index 04b38cd10a..5892d76490 100644 --- a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py @@ -113,7 +113,7 @@ class KyutaiSpeechToTextFlexibleLinear(nn.Module): @auto_docstring class KyutaiSpeechToTextPreTrainedModel(PreTrainedModel): - config_class = KyutaiSpeechToTextConfig + config: KyutaiSpeechToTextConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["KyutaiSpeechToTextDecoderLayer", "MimiTransformerLayer"] diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 6fd8fcc807..22f8745f5b 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -487,7 +487,7 @@ class LayoutLMOnlyMLMHead(nn.Module): @auto_docstring class LayoutLMPreTrainedModel(PreTrainedModel): - config_class = LayoutLMConfig + config: LayoutLMConfig base_model_prefix = "layoutlm" 
supports_gradient_checkpointing = True diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index 7f6a861a67..11d8127ef6 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -468,7 +468,7 @@ class LayoutLMv2Encoder(nn.Module): @auto_docstring class LayoutLMv2PreTrainedModel(PreTrainedModel): - config_class = LayoutLMv2Config + config: LayoutLMv2Config base_model_prefix = "layoutlmv2" def _init_weights(self, module): diff --git a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py index 8b56285410..bd8b525bb4 100644 --- a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py @@ -200,7 +200,7 @@ class LayoutLMv3TextEmbeddings(nn.Module): @auto_docstring class LayoutLMv3PreTrainedModel(PreTrainedModel): - config_class = LayoutLMv3Config + config: LayoutLMv3Config base_model_prefix = "layoutlmv3" def _init_weights(self, module): diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index b665858862..ae0f361bd4 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -1107,7 +1107,7 @@ class LEDClassificationHead(nn.Module): @auto_docstring class LEDPreTrainedModel(PreTrainedModel): - config_class = LEDConfig + config: LEDConfig base_model_prefix = "led" supports_gradient_checkpointing = True diff --git a/src/transformers/models/levit/modeling_levit.py b/src/transformers/models/levit/modeling_levit.py index 623c8e2278..fc275a1c4c 100644 --- a/src/transformers/models/levit/modeling_levit.py +++ b/src/transformers/models/levit/modeling_levit.py @@ -468,7 +468,7 @@ class LevitClassificationLayer(nn.Module): @auto_docstring class LevitPreTrainedModel(PreTrainedModel): - config_class 
= LevitConfig + config: LevitConfig base_model_prefix = "levit" main_input_name = "pixel_values" _no_split_modules = ["LevitResidualLayer"] diff --git a/src/transformers/models/lfm2/modeling_lfm2.py b/src/transformers/models/lfm2/modeling_lfm2.py index dc58e8f2e1..99b2730507 100644 --- a/src/transformers/models/lfm2/modeling_lfm2.py +++ b/src/transformers/models/lfm2/modeling_lfm2.py @@ -534,7 +534,7 @@ class Lfm2DecoderLayer(GradientCheckpointingLayer): @auto_docstring class Lfm2PreTrainedModel(PreTrainedModel): - config_class = Lfm2Config + config: Lfm2Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Lfm2DecoderLayer"] diff --git a/src/transformers/models/lightglue/modeling_lightglue.py b/src/transformers/models/lightglue/modeling_lightglue.py index 5b863a7a93..e635f13e33 100644 --- a/src/transformers/models/lightglue/modeling_lightglue.py +++ b/src/transformers/models/lightglue/modeling_lightglue.py @@ -419,7 +419,7 @@ class LightGluePreTrainedModel(PreTrainedModel): models. """ - config_class = LightGlueConfig + config: LightGlueConfig base_model_prefix = "lightglue" main_input_name = "pixel_values" supports_gradient_checkpointing = False diff --git a/src/transformers/models/lightglue/modular_lightglue.py b/src/transformers/models/lightglue/modular_lightglue.py index 544cad5c79..78caf28f15 100644 --- a/src/transformers/models/lightglue/modular_lightglue.py +++ b/src/transformers/models/lightglue/modular_lightglue.py @@ -504,7 +504,7 @@ class LightGluePreTrainedModel(PreTrainedModel): models. 
""" - config_class = LightGlueConfig + config: LightGlueConfig base_model_prefix = "lightglue" main_input_name = "pixel_values" supports_gradient_checkpointing = False diff --git a/src/transformers/models/lilt/modeling_lilt.py b/src/transformers/models/lilt/modeling_lilt.py index deddd105da..c3bcbf31f0 100644 --- a/src/transformers/models/lilt/modeling_lilt.py +++ b/src/transformers/models/lilt/modeling_lilt.py @@ -560,7 +560,7 @@ class LiltPooler(nn.Module): @auto_docstring class LiltPreTrainedModel(PreTrainedModel): - config_class = LiltConfig + config: LiltConfig base_model_prefix = "lilt" supports_gradient_checkpointing = True _no_split_modules = [] diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index c5a5756363..ee7c72aabb 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -306,7 +306,7 @@ class LlamaDecoderLayer(GradientCheckpointingLayer): @auto_docstring class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig + config: LlamaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py index 24fc14bc96..3be5760ae2 100644 --- a/src/transformers/models/llama4/modeling_llama4.py +++ b/src/transformers/models/llama4/modeling_llama4.py @@ -430,7 +430,7 @@ class Llama4TextDecoderLayer(GradientCheckpointingLayer): @auto_docstring class Llama4PreTrainedModel(PreTrainedModel): - config_class = Llama4Config + config: Llama4Config supports_gradient_checkpointing = True _skip_keys_device_placement = ["past_key_values"] _supports_flash_attn = False @@ -471,7 +471,7 @@ class Llama4PreTrainedModel(PreTrainedModel): class Llama4TextModel(Llama4PreTrainedModel): _no_split_modules = ["Llama4TextDecoderLayer"] base_model_prefix = "model" - config_class = 
Llama4TextConfig + config: Llama4TextConfig def __init__(self, config: Llama4TextConfig): super().__init__(config) @@ -608,7 +608,7 @@ class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin): base_model_prefix = "language_model" _tied_weights_keys = ["lm_head.weight"] _tp_plan = {"lm_head": "colwise_rep"} - config_class = Llama4TextConfig + config: Llama4TextConfig def __init__(self, config: Llama4TextConfig): super().__init__(config) @@ -1076,7 +1076,7 @@ class Llama4VisionRotaryEmbedding(nn.Module): class Llama4VisionModel(Llama4PreTrainedModel): base_model_prefix = "vision_model" _no_split_modules = ["Llama4VisionEncoderLayer"] - config_class = Llama4VisionConfig + config: Llama4VisionConfig def __init__(self, config: Llama4VisionConfig): super().__init__(config) @@ -1211,7 +1211,7 @@ class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin): _no_split_modules = ["Llama4TextDecoderLayer", "Llama4VisionEncoderLayer"] _tp_plan = {} base_model_prefix = "" - config_class = Llama4Config + config: Llama4Config def __init__(self, config: Llama4Config): super().__init__(config) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 7cd79de12d..58331bdff0 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -113,7 +113,7 @@ class LlavaMultiModalProjector(nn.Module): @auto_docstring class LlavaPreTrainedModel(PreTrainedModel): - config_class = LlavaConfig + config: LlavaConfig base_model_prefix = "" supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index c019fb275c..03fa7015d3 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -223,7 +223,7 @@ class 
LlavaNextMultiModalProjector(nn.Module): @auto_docstring class LlavaNextPreTrainedModel(PreTrainedModel): - config_class = LlavaNextConfig + config: LlavaNextConfig base_model_prefix = "" supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 2d73a51a28..c28a9c1565 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -164,7 +164,7 @@ class LlavaNextVideoMultiModalProjector(nn.Module): @auto_docstring class LlavaNextVideoPreTrainedModel(PreTrainedModel): - config_class = LlavaNextVideoConfig + config: LlavaNextVideoConfig base_model_prefix = "" supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index af16955b41..2d07527f48 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -277,7 +277,7 @@ def unpad_image(tensor, original_size): @auto_docstring class LlavaOnevisionPreTrainedModel(PreTrainedModel): - config_class = LlavaOnevisionConfig + config: LlavaOnevisionConfig base_model_prefix = "" supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 98390cc6f8..e2240c3843 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1350,7 +1350,7 @@ class LongformerLMHead(nn.Module): @auto_docstring class LongformerPreTrainedModel(PreTrainedModel): - config_class = 
LongformerConfig + config: LongformerConfig base_model_prefix = "longformer" supports_gradient_checkpointing = True _no_split_modules = ["LongformerSelfAttention"] diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index b0613a2ea5..871e75a129 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -1245,7 +1245,7 @@ class LongT5Block(GradientCheckpointingLayer): @auto_docstring class LongT5PreTrainedModel(PreTrainedModel): - config_class = LongT5Config + config: LongT5Config base_model_prefix = "transformer" supports_gradient_checkpointing = True _no_split_modules = ["LongT5Block"] diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py index 65d998625d..ed1f2084c9 100644 --- a/src/transformers/models/luke/modeling_luke.py +++ b/src/transformers/models/luke/modeling_luke.py @@ -780,7 +780,7 @@ class EntityPredictionHead(nn.Module): @auto_docstring class LukePreTrainedModel(PreTrainedModel): - config_class = LukeConfig + config: LukeConfig base_model_prefix = "luke" supports_gradient_checkpointing = True _no_split_modules = ["LukeAttention", "LukeEntityEmbeddings"] diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index 4138cb0b82..00243ce123 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -759,7 +759,7 @@ class LxmertPreTrainingHeads(nn.Module): @auto_docstring class LxmertPreTrainedModel(PreTrainedModel): - config_class = LxmertConfig + config: LxmertConfig load_tf_weights = load_tf_weights_in_lxmert base_model_prefix = "lxmert" _supports_param_buffer_assignment = False diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index cc663aa743..b45fb0e68b 100755 --- 
a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -516,7 +516,7 @@ class M2M100DecoderLayer(GradientCheckpointingLayer): @auto_docstring class M2M100PreTrainedModel(PreTrainedModel): - config_class = M2M100Config + config: M2M100Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["M2M100EncoderLayer", "M2M100DecoderLayer"] diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index 7da4ef5787..fd0e362e9f 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -374,7 +374,7 @@ class MambaBlock(GradientCheckpointingLayer): @auto_docstring class MambaPreTrainedModel(PreTrainedModel): - config_class = MambaConfig + config: MambaConfig base_model_prefix = "backbone" _no_split_modules = ["MambaBlock", "MambaMixer"] supports_gradient_checkpointing = True diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py index e601b4d8a6..2511c1809d 100644 --- a/src/transformers/models/mamba2/modeling_mamba2.py +++ b/src/transformers/models/mamba2/modeling_mamba2.py @@ -713,7 +713,7 @@ class Mamba2Block(GradientCheckpointingLayer): @auto_docstring class Mamba2PreTrainedModel(PreTrainedModel): - config_class = Mamba2Config + config: Mamba2Config base_model_prefix = "backbone" _no_split_modules = ["Mamba2Block"] supports_gradient_checkpointing = True diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 94c913ad7a..052bbe3c1a 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -460,7 +460,7 @@ class MarianDecoderLayer(GradientCheckpointingLayer): @auto_docstring class MarianPreTrainedModel(PreTrainedModel): - config_class = MarianConfig + config: MarianConfig 
base_model_prefix = "model" supports_gradient_checkpointing = True _supports_flash_attn = True diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index 9fb5a7469b..3bf6553859 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -579,7 +579,7 @@ class MarkupLMEncoder(nn.Module): @auto_docstring class MarkupLMPreTrainedModel(PreTrainedModel): - config_class = MarkupLMConfig + config: MarkupLMConfig base_model_prefix = "markuplm" # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights with Bert->MarkupLM diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index fbcf33f486..7f730bdb99 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -2085,7 +2085,7 @@ class Mask2FormerTransformerModule(nn.Module): @auto_docstring class Mask2FormerPreTrainedModel(PreTrainedModel): - config_class = Mask2FormerConfig + config: Mask2FormerConfig base_model_prefix = "model" main_input_name = "pixel_values" diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index 8f1febefc8..3be1021a2c 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -1419,7 +1419,7 @@ class MaskFormerTransformerModule(nn.Module): @auto_docstring class MaskFormerPreTrainedModel(PreTrainedModel): - config_class = MaskFormerConfig + config: MaskFormerConfig base_model_prefix = "model" main_input_name = "pixel_values" diff --git a/src/transformers/models/maskformer/modeling_maskformer_swin.py b/src/transformers/models/maskformer/modeling_maskformer_swin.py index 6cfaf8d92e..22e91c0970 100644 --- 
a/src/transformers/models/maskformer/modeling_maskformer_swin.py +++ b/src/transformers/models/maskformer/modeling_maskformer_swin.py @@ -734,7 +734,7 @@ class MaskFormerSwinEncoder(nn.Module): @auto_docstring class MaskFormerSwinPreTrainedModel(PreTrainedModel): - config_class = MaskFormerSwinConfig + config: MaskFormerSwinConfig base_model_prefix = "model" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 7731018611..63b6ec0cb1 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -485,7 +485,7 @@ class MBartClassificationHead(nn.Module): @auto_docstring class MBartPreTrainedModel(PreTrainedModel): - config_class = MBartConfig + config: MBartConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["MBartDecoderLayer", "MBartEncoderLayer", "MBartAttention"] diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 7ed94107b7..d55734c267 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -679,7 +679,7 @@ class MegatronBertPreTrainingHeads(nn.Module): @auto_docstring class MegatronBertPreTrainedModel(PreTrainedModel): - config_class = MegatronBertConfig + config: MegatronBertConfig load_tf_weights = load_tf_weights_in_megatron_bert base_model_prefix = "bert" supports_gradient_checkpointing = True diff --git a/src/transformers/models/mgp_str/modeling_mgp_str.py b/src/transformers/models/mgp_str/modeling_mgp_str.py index ef05462f87..27b9d0df9d 100644 --- a/src/transformers/models/mgp_str/modeling_mgp_str.py +++ b/src/transformers/models/mgp_str/modeling_mgp_str.py @@ -286,7 +286,7 @@ class MgpstrA3Module(nn.Module): @auto_docstring class 
MgpstrPreTrainedModel(PreTrainedModel): - config_class = MgpstrConfig + config: MgpstrConfig base_model_prefix = "mgp_str" _no_split_modules = [] @@ -357,7 +357,7 @@ class MgpstrModel(MgpstrPreTrainedModel): """ ) class MgpstrForSceneTextRecognition(MgpstrPreTrainedModel): - config_class = MgpstrConfig + config: MgpstrConfig main_input_name = "pixel_values" def __init__(self, config: MgpstrConfig) -> None: diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py index 6afc4fdb91..0be377794d 100644 --- a/src/transformers/models/mimi/modeling_mimi.py +++ b/src/transformers/models/mimi/modeling_mimi.py @@ -1367,7 +1367,7 @@ class MimiSplitResidualVectorQuantizer(nn.Module): @auto_docstring class MimiPreTrainedModel(PreTrainedModel): - config_class = MimiConfig + config: MimiConfig base_model_prefix = "mimi" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/minimax/modeling_minimax.py b/src/transformers/models/minimax/modeling_minimax.py index 057f8d8097..40ef35a3a6 100644 --- a/src/transformers/models/minimax/modeling_minimax.py +++ b/src/transformers/models/minimax/modeling_minimax.py @@ -580,7 +580,7 @@ class MiniMaxDecoderLayer(GradientCheckpointingLayer): @auto_docstring class MiniMaxPreTrainedModel(PreTrainedModel): - config_class = MiniMaxConfig + config: MiniMaxConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["MiniMaxDecoderLayer"] diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index a6fbfbb489..29727dc415 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -251,7 +251,7 @@ class MistralDecoderLayer(GradientCheckpointingLayer): @auto_docstring class MistralPreTrainedModel(PreTrainedModel): - config_class = MistralConfig + config: MistralConfig base_model_prefix = 
"model" supports_gradient_checkpointing = True _no_split_modules = ["MistralDecoderLayer"] diff --git a/src/transformers/models/mistral3/modeling_mistral3.py b/src/transformers/models/mistral3/modeling_mistral3.py index 20e81a8404..769697ada0 100644 --- a/src/transformers/models/mistral3/modeling_mistral3.py +++ b/src/transformers/models/mistral3/modeling_mistral3.py @@ -178,7 +178,7 @@ class Mistral3ModelOutputWithPast(BaseModelOutputWithPast): @auto_docstring class Mistral3PreTrainedModel(PreTrainedModel): - config_class = Mistral3Config + config: Mistral3Config base_model_prefix = "" supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index cc78dfecd5..671bc0390a 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -380,7 +380,7 @@ class MixtralRotaryEmbedding(nn.Module): @auto_docstring class MixtralPreTrainedModel(PreTrainedModel): - config_class = MixtralConfig + config: MixtralConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["MixtralDecoderLayer"] diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py index 12fd0c6830..28919dae1d 100644 --- a/src/transformers/models/mlcd/modeling_mlcd.py +++ b/src/transformers/models/mlcd/modeling_mlcd.py @@ -505,7 +505,7 @@ class MLCDVisionTransformer(nn.Module): @auto_docstring class MLCDPreTrainedModel(PreTrainedModel): - config_class = MLCDVisionConfig + config: MLCDVisionConfig base_model_prefix = "mlcd" supports_gradient_checkpointing = True _supports_flash_attn = True @@ -549,7 +549,7 @@ class MLCDPreTrainedModel(PreTrainedModel): """ ) class MLCDVisionModel(MLCDPreTrainedModel): - config_class = MLCDVisionConfig + config: MLCDVisionConfig main_input_name = "pixel_values" _no_split_modules = 
["MLCDEncoderLayer"] diff --git a/src/transformers/models/mlcd/modular_mlcd.py b/src/transformers/models/mlcd/modular_mlcd.py index a640ed0b59..fcc18ab2b1 100644 --- a/src/transformers/models/mlcd/modular_mlcd.py +++ b/src/transformers/models/mlcd/modular_mlcd.py @@ -439,7 +439,7 @@ class MLCDVisionTransformer(CLIPVisionTransformer): @auto_docstring class MLCDPreTrainedModel(PreTrainedModel): - config_class = MLCDVisionConfig + config: MLCDVisionConfig base_model_prefix = "mlcd" supports_gradient_checkpointing = True _supports_flash_attn = True diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index fcfa4a3db7..b520b75001 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -841,7 +841,7 @@ class MllamaRotaryEmbedding(nn.Module): @auto_docstring class MllamaPreTrainedModel(PreTrainedModel): - config_class = MllamaConfig + config: MllamaConfig base_model_prefix = "" supports_gradient_checkpointing = True _no_split_modules = [ @@ -1019,7 +1019,7 @@ class MllamaPreTrainedModel(PreTrainedModel): """ ) class MllamaVisionModel(MllamaPreTrainedModel): - config_class = MllamaVisionConfig + config: MllamaVisionConfig base_model_prefix = "vision_model" def __init__(self, config: MllamaVisionConfig): @@ -1250,7 +1250,7 @@ class MllamaVisionModel(MllamaPreTrainedModel): """ ) class MllamaTextModel(MllamaPreTrainedModel): - config_class = MllamaTextConfig + config: MllamaTextConfig base_model_prefix = "language_model.model" def __init__(self, config: MllamaTextConfig): @@ -1454,7 +1454,7 @@ class MllamaTextModel(MllamaPreTrainedModel): """ ) class MllamaForCausalLM(MllamaPreTrainedModel, GenerationMixin): - config_class = MllamaTextConfig + config: MllamaTextConfig _supports_static_cache = True # only the LLM without cross attn can do compile base_model_prefix = "language_model" _tied_weights_keys = ["lm_head.weight"] diff --git 
a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index 91508d0997..9976868500 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -658,7 +658,7 @@ class MobileBertPreTrainingHeads(nn.Module): @auto_docstring class MobileBertPreTrainedModel(PreTrainedModel): - config_class = MobileBertConfig + config: MobileBertConfig load_tf_weights = load_tf_weights_in_mobilebert base_model_prefix = "mobilebert" diff --git a/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py b/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py index b77dd75ebf..47b9f43e8a 100755 --- a/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +++ b/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py @@ -229,7 +229,7 @@ class MobileNetV1ConvLayer(nn.Module): @auto_docstring class MobileNetV1PreTrainedModel(PreTrainedModel): - config_class = MobileNetV1Config + config: MobileNetV1Config load_tf_weights = load_tf_weights_in_mobilenet_v1 base_model_prefix = "mobilenet_v1" main_input_name = "pixel_values" diff --git a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py index cc3173896c..fa213ab9d9 100755 --- a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py @@ -422,7 +422,7 @@ class MobileNetV2Stem(nn.Module): @auto_docstring class MobileNetV2PreTrainedModel(PreTrainedModel): - config_class = MobileNetV2Config + config: MobileNetV2Config load_tf_weights = load_tf_weights_in_mobilenet_v2 base_model_prefix = "mobilenet_v2" main_input_name = "pixel_values" diff --git a/src/transformers/models/mobilevit/modeling_mobilevit.py b/src/transformers/models/mobilevit/modeling_mobilevit.py index 3f882b9850..ce6c67e1ad 100755 --- 
a/src/transformers/models/mobilevit/modeling_mobilevit.py +++ b/src/transformers/models/mobilevit/modeling_mobilevit.py @@ -623,7 +623,7 @@ class MobileViTEncoder(nn.Module): @auto_docstring class MobileViTPreTrainedModel(PreTrainedModel): - config_class = MobileViTConfig + config: MobileViTConfig base_model_prefix = "mobilevit" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py index a52aedca7c..f37f37c605 100644 --- a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py +++ b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py @@ -571,7 +571,7 @@ class MobileViTV2Encoder(nn.Module): @auto_docstring # Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTPreTrainedModel with MobileViT->MobileViTV2,mobilevit->mobilevitv2 class MobileViTV2PreTrainedModel(PreTrainedModel): - config_class = MobileViTV2Config + config: MobileViTV2Config base_model_prefix = "mobilevitv2" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/modernbert/modeling_modernbert.py b/src/transformers/models/modernbert/modeling_modernbert.py index e6d6c3e712..9b26635835 100644 --- a/src/transformers/models/modernbert/modeling_modernbert.py +++ b/src/transformers/models/modernbert/modeling_modernbert.py @@ -556,7 +556,7 @@ class ModernBertEncoderLayer(GradientCheckpointingLayer): @auto_docstring class ModernBertPreTrainedModel(PreTrainedModel): - config_class = ModernBertConfig + config: ModernBertConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["ModernBertEmbeddings", "ModernBertEncoderLayer"] diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 32e694d7d5..3e4041bd8b 100644 --- 
a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -756,7 +756,7 @@ class ModernBertEncoderLayer(GradientCheckpointingLayer): @auto_docstring class ModernBertPreTrainedModel(PreTrainedModel): - config_class = ModernBertConfig + config: ModernBertConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["ModernBertEmbeddings", "ModernBertEncoderLayer"] diff --git a/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py index 011db51daa..56b53446fd 100644 --- a/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py @@ -217,7 +217,7 @@ class ModernBertDecoderLayer(GradientCheckpointingLayer): @auto_docstring class ModernBertDecoderPreTrainedModel(ModernBertPreTrainedModel): - config_class = ModernBertDecoderConfig + config: ModernBertDecoderConfig base_model_prefix = "model" _skip_keys_device_placement = ["past_key_values"] _no_split_modules = ["ModernBertDecoderLayer"] diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index 7609c2f1fe..f82fb1573a 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -394,7 +394,7 @@ class ModernBertDecoderLayer(GradientCheckpointingLayer): @auto_docstring class ModernBertDecoderPreTrainedModel(ModernBertPreTrainedModel): - config_class = ModernBertDecoderConfig + config: ModernBertDecoderConfig base_model_prefix = "model" _skip_keys_device_placement = ["past_key_values"] _no_split_modules = ["ModernBertDecoderLayer"] diff --git a/src/transformers/models/moonshine/modeling_moonshine.py
b/src/transformers/models/moonshine/modeling_moonshine.py index 9c2641d978..61ab1cbcc5 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ b/src/transformers/models/moonshine/modeling_moonshine.py @@ -454,7 +454,7 @@ class MoonshineDecoderLayer(GradientCheckpointingLayer): @auto_docstring class MoonshinePreTrainedModel(PreTrainedModel): - config_class = MoonshineConfig + config: MoonshineConfig base_model_prefix = "model" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 4e2882fb81..9706d99d7c 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -489,7 +489,7 @@ class MoonshineDecoderLayer(GradientCheckpointingLayer): @auto_docstring class MoonshinePreTrainedModel(PreTrainedModel): - config_class = MoonshineConfig + config: MoonshineConfig base_model_prefix = "model" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index 4cde9816bf..94a3a1fa1f 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -801,7 +801,7 @@ class MoshiDecoderLayer(GradientCheckpointingLayer): @auto_docstring class MoshiPreTrainedModel(PreTrainedModel): - config_class = MoshiConfig + config: MoshiConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["MoshiDecoderLayer", "MimiTransformerLayer"] @@ -835,7 +835,7 @@ class MoshiDepthDecoder(MoshiPreTrainedModel, GenerationMixin): config: MoshiConfig """ - config_class = MoshiDepthConfig + config: MoshiDepthConfig def __init__(self, config: MoshiDepthConfig): super().__init__(config) @@ -1628,7 +1628,7 @@ class MoshiForCausalLM(MoshiPreTrainedModel, 
GenerationMixin): ) class MoshiForConditionalGeneration(MoshiPreTrainedModel, GenerationMixin): _tied_weights_keys = ["decoder.model.embed_tokens.weight", "decoder.lm_head.weight"] - config_class = MoshiConfig + config: MoshiConfig main_input_name = "input_ids" supports_gradient_checkpointing = True _supports_flash_attn = True diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index 82698f8ecf..b25e549173 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -43,7 +43,7 @@ logger = logging.get_logger(__name__) @auto_docstring class MPNetPreTrainedModel(PreTrainedModel): - config_class = MPNetConfig + config: MPNetConfig base_model_prefix = "mpnet" def _init_weights(self, module): diff --git a/src/transformers/models/mpt/modeling_mpt.py b/src/transformers/models/mpt/modeling_mpt.py index 81680bef79..706318cc79 100644 --- a/src/transformers/models/mpt/modeling_mpt.py +++ b/src/transformers/models/mpt/modeling_mpt.py @@ -219,7 +219,7 @@ class MptBlock(GradientCheckpointingLayer): @auto_docstring class MptPreTrainedModel(PreTrainedModel): - config_class = MptConfig + config: MptConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True _no_split_modules = ["MptBlock"] diff --git a/src/transformers/models/mra/modeling_mra.py b/src/transformers/models/mra/modeling_mra.py index 159299aa30..602cf53cc0 100644 --- a/src/transformers/models/mra/modeling_mra.py +++ b/src/transformers/models/mra/modeling_mra.py @@ -817,7 +817,7 @@ class MraOnlyMLMHead(nn.Module): @auto_docstring # Copied from transformers.models.yoso.modeling_yoso.YosoPreTrainedModel with Yoso->Mra,yoso->mra class MraPreTrainedModel(PreTrainedModel): - config_class = MraConfig + config: MraConfig base_model_prefix = "mra" supports_gradient_checkpointing = True diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index 
87dcd1d41f..f30bc3073e 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -752,7 +752,7 @@ class MT5ClassificationHead(nn.Module): @auto_docstring # Copied from transformers.models.t5.modeling_t5.T5PreTrainedModel with T5->MT5, t5->mt5 class MT5PreTrainedModel(PreTrainedModel): - config_class = MT5Config + config: MT5Config load_tf_weights = load_tf_weights_in_mt5 base_model_prefix = "transformer" is_parallelizable = True @@ -1297,7 +1297,7 @@ class MT5Model(MT5PreTrainedModel): ```""" model_type = "mt5" - config_class = MT5Config + config: MT5Config _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] @@ -1564,7 +1564,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin): ```""" model_type = "mt5" - config_class = MT5Config + config: MT5Config _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] @@ -1874,7 +1874,7 @@ class MT5EncoderModel(MT5PreTrainedModel): ```""" model_type = "mt5" - config_class = MT5Config + config: MT5Config _tied_weights_keys = ["encoder.embed_tokens.weight"] # Copied from transformers.models.t5.modeling_t5.T5EncoderModel.__init__ with T5->MT5 diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index b7f092ea64..c2e8e430c7 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -419,7 +419,7 @@ class MusicgenDecoderLayer(GradientCheckpointingLayer): @auto_docstring class MusicgenPreTrainedModel(PreTrainedModel): - config_class = MusicgenDecoderConfig + config: MusicgenDecoderConfig base_model_prefix = "model" 
supports_gradient_checkpointing = True _no_split_modules = ["MusicgenDecoderLayer", "MusicgenAttention"] @@ -1340,7 +1340,7 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel, GenerationMixin): """ ) class MusicgenForConditionalGeneration(PreTrainedModel, GenerationMixin): - config_class = MusicgenConfig + config: MusicgenConfig base_model_prefix = "encoder_decoder" main_input_name = "input_ids" supports_gradient_checkpointing = True diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index e415967b0a..fce8bea6b4 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -384,7 +384,7 @@ class MusicgenMelodyDecoderLayer(GradientCheckpointingLayer): @auto_docstring # Copied from transformers.models.musicgen.modeling_musicgen.MusicgenPreTrainedModel with Musicgen->MusicgenMelody class MusicgenMelodyPreTrainedModel(PreTrainedModel): - config_class = MusicgenMelodyDecoderConfig + config: MusicgenMelodyDecoderConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["MusicgenMelodyDecoderLayer", "MusicgenMelodyAttention"] @@ -1269,7 +1269,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel, GenerationMixin): @auto_docstring class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin): - config_class = MusicgenMelodyConfig + config: MusicgenMelodyConfig main_input_name = "input_ids" supports_gradient_checkpointing = True _supports_flash_attn = True diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index 1223d23fba..fd8f19eccc 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -486,7 +486,7 @@ class MvpPrompt(nn.Module): @auto_docstring class MvpPreTrainedModel(PreTrainedModel): - config_class = 
MvpConfig + config: MvpConfig base_model_prefix = "model" supports_gradient_checkpointing = True diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index 745f88b887..8da4a3bde8 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -581,7 +581,7 @@ class NemotronDecoderLayer(GradientCheckpointingLayer): @auto_docstring class NemotronPreTrainedModel(PreTrainedModel): - config_class = NemotronConfig + config: NemotronConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["NemotronDecoderLayer"] diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index ea58413763..5537d8c128 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -839,7 +839,7 @@ class NllbMoeDecoderLayer(GradientCheckpointingLayer): @auto_docstring class NllbMoePreTrainedModel(PreTrainedModel): - config_class = NllbMoeConfig + config: NllbMoeConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["NllbMoeEncoderLayer", "NllbMoeDecoderLayer"] diff --git a/src/transformers/models/nystromformer/modeling_nystromformer.py b/src/transformers/models/nystromformer/modeling_nystromformer.py index babd8acc09..45e69b6b46 100755 --- a/src/transformers/models/nystromformer/modeling_nystromformer.py +++ b/src/transformers/models/nystromformer/modeling_nystromformer.py @@ -443,7 +443,7 @@ class NystromformerOnlyMLMHead(nn.Module): @auto_docstring class NystromformerPreTrainedModel(PreTrainedModel): - config_class = NystromformerConfig + config: NystromformerConfig base_model_prefix = "nystromformer" supports_gradient_checkpointing = True diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 
77e41e2d62..ef66f62236 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -285,7 +285,7 @@ class OlmoRotaryEmbedding(nn.Module): @auto_docstring class OlmoPreTrainedModel(PreTrainedModel): - config_class = OlmoConfig + config: OlmoConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["OlmoDecoderLayer"] diff --git a/src/transformers/models/olmo2/modeling_olmo2.py b/src/transformers/models/olmo2/modeling_olmo2.py index 5dd56e2edd..fde7f95adf 100644 --- a/src/transformers/models/olmo2/modeling_olmo2.py +++ b/src/transformers/models/olmo2/modeling_olmo2.py @@ -290,7 +290,7 @@ class Olmo2RotaryEmbedding(nn.Module): @auto_docstring class Olmo2PreTrainedModel(PreTrainedModel): - config_class = Olmo2Config + config: Olmo2Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Olmo2DecoderLayer"] diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index 6c491754d4..89ebcb2013 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -698,7 +698,7 @@ class OlmoeDecoderLayer(GradientCheckpointingLayer): @auto_docstring class OlmoePreTrainedModel(PreTrainedModel): - config_class = OlmoeConfig + config: OlmoeConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["OlmoeDecoderLayer"] diff --git a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py index ab1ae0b974..8256f070df 100644 --- a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py +++ b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py @@ -982,7 +982,7 @@ class OmDetTurboDeformableTransformerDecoderLayer(GradientCheckpointingLayer): @auto_docstring class OmDetTurboPreTrainedModel(PreTrainedModel): - config_class = OmDetTurboConfig + 
config: OmDetTurboConfig base_model_prefix = "model" main_input_name = "pixel_values" diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py index 28eadd3a48..53160467c7 100644 --- a/src/transformers/models/oneformer/modeling_oneformer.py +++ b/src/transformers/models/oneformer/modeling_oneformer.py @@ -2757,7 +2757,7 @@ class OneFormerTaskModel(nn.Module): @auto_docstring class OneFormerPreTrainedModel(PreTrainedModel): - config_class = OneFormerConfig + config: OneFormerConfig base_model_prefix = "model" main_input_name = "pixel_values" diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index fd83b02b90..074af9ce11 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -358,7 +358,7 @@ class OpenAIGPTSequenceSummary(nn.Module): @auto_docstring class OpenAIGPTPreTrainedModel(PreTrainedModel): - config_class = OpenAIGPTConfig + config: OpenAIGPTConfig load_tf_weights = load_tf_weights_in_openai_gpt base_model_prefix = "transformer" diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index b6641f4820..e275168017 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -304,7 +304,7 @@ class OPTDecoderLayer(GradientCheckpointingLayer): @auto_docstring class OPTPreTrainedModel(PreTrainedModel): - config_class = OPTConfig + config: OPTConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["OPTDecoderLayer"] diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py index 8940943851..6294e58d69 100644 --- a/src/transformers/models/owlv2/modeling_owlv2.py +++ b/src/transformers/models/owlv2/modeling_owlv2.py @@ -555,7 +555,7 @@ class Owlv2EncoderLayer(GradientCheckpointingLayer): 
@auto_docstring # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTPreTrainedModel with OwlViT->Owlv2,owlvit->owlv2 class Owlv2PreTrainedModel(PreTrainedModel): - config_class = Owlv2Config + config: Owlv2Config base_model_prefix = "owlv2" supports_gradient_checkpointing = True _no_split_modules = ["Owlv2EncoderLayer"] @@ -761,7 +761,7 @@ class Owlv2TextTransformer(nn.Module): # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTTextModel with google/owlvit-base-patch32->google/owlv2-base-patch16, OWLVIT->OWLV2,OwlViT->Owlv2 class Owlv2TextModel(Owlv2PreTrainedModel): - config_class = Owlv2TextConfig + config: Owlv2TextConfig def __init__(self, config: Owlv2TextConfig): super().__init__(config) @@ -872,7 +872,7 @@ class Owlv2VisionTransformer(nn.Module): # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTVisionModel with OWLVIT->OWLV2,OwlViT->Owlv2,google/owlvit-base-patch32->google/owlv2-base-patch16 class Owlv2VisionModel(Owlv2PreTrainedModel): - config_class = Owlv2VisionConfig + config: Owlv2VisionConfig main_input_name = "pixel_values" def __init__(self, config: Owlv2VisionConfig): @@ -923,7 +923,7 @@ class Owlv2VisionModel(Owlv2PreTrainedModel): @auto_docstring # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTModel with google/owlvit-base-patch32->google/owlv2-base-patch16-ensemble, OWLVIT->OWLV2,OwlViT->Owlv2,owlvit->owlv2,OWL-ViT->OWLv2 class Owlv2Model(Owlv2PreTrainedModel): - config_class = Owlv2Config + config: Owlv2Config def __init__(self, config: Owlv2Config): super().__init__(config) @@ -1208,7 +1208,7 @@ class Owlv2ClassPredictionHead(nn.Module): class Owlv2ForObjectDetection(Owlv2PreTrainedModel): - config_class = Owlv2Config + config: Owlv2Config def __init__(self, config: Owlv2Config): super().__init__(config) diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index f93a6836f3..da914d48b1 100644 --- 
a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -542,7 +542,7 @@ class OwlViTEncoderLayer(GradientCheckpointingLayer): @auto_docstring class OwlViTPreTrainedModel(PreTrainedModel): - config_class = OwlViTConfig + config: OwlViTConfig base_model_prefix = "owlvit" supports_gradient_checkpointing = True _no_split_modules = ["OwlViTEncoderLayer"] @@ -745,7 +745,7 @@ class OwlViTTextTransformer(nn.Module): class OwlViTTextModel(OwlViTPreTrainedModel): - config_class = OwlViTTextConfig + config: OwlViTTextConfig def __init__(self, config: OwlViTTextConfig): super().__init__(config) @@ -854,7 +854,7 @@ class OwlViTVisionTransformer(nn.Module): class OwlViTVisionModel(OwlViTPreTrainedModel): - config_class = OwlViTVisionConfig + config: OwlViTVisionConfig main_input_name = "pixel_values" def __init__(self, config: OwlViTVisionConfig): @@ -904,7 +904,7 @@ class OwlViTVisionModel(OwlViTPreTrainedModel): @auto_docstring class OwlViTModel(OwlViTPreTrainedModel): - config_class = OwlViTConfig + config: OwlViTConfig def __init__(self, config: OwlViTConfig): super().__init__(config) @@ -1187,7 +1187,7 @@ class OwlViTClassPredictionHead(nn.Module): class OwlViTForObjectDetection(OwlViTPreTrainedModel): - config_class = OwlViTConfig + config: OwlViTConfig def __init__(self, config: OwlViTConfig): super().__init__(config) diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index 5ab2b93e32..f10ece1c19 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -108,7 +108,7 @@ class PaliGemmaMultiModalProjector(nn.Module): @auto_docstring class PaliGemmaPreTrainedModel(PreTrainedModel): - config_class = PaliGemmaConfig + config: PaliGemmaConfig base_model_prefix = "" supports_gradient_checkpointing = True _no_split_modules = ["PaliGemmaMultiModalProjector"] diff --git 
a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py index bf2dc59d24..981575f42a 100644 --- a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py +++ b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py @@ -683,7 +683,7 @@ class PatchTSMixerLinearHead(nn.Module): @auto_docstring class PatchTSMixerPreTrainedModel(PreTrainedModel): # Weight initialization - config_class = PatchTSMixerConfig + config: PatchTSMixerConfig base_model_prefix = "model" main_input_name = "past_values" supports_gradient_checkpointing = False diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index c613fd8955..559cf7df61 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -553,7 +553,7 @@ class PatchTSTEncoderLayer(nn.Module): @auto_docstring class PatchTSTPreTrainedModel(PreTrainedModel): - config_class = PatchTSTConfig + config: PatchTSTConfig base_model_prefix = "model" main_input_name = "past_values" supports_gradient_checkpointing = False diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 1eec8b4166..f3c5bc8a4f 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -451,7 +451,7 @@ class PegasusDecoderLayer(GradientCheckpointingLayer): @auto_docstring class PegasusPreTrainedModel(PreTrainedModel): - config_class = PegasusConfig + config: PegasusConfig base_model_prefix = "model" supports_gradient_checkpointing = True _supports_flash_attn = True diff --git a/src/transformers/models/pegasus_x/modeling_pegasus_x.py b/src/transformers/models/pegasus_x/modeling_pegasus_x.py index a371be87de..46d3730695 100755 --- a/src/transformers/models/pegasus_x/modeling_pegasus_x.py +++ 
b/src/transformers/models/pegasus_x/modeling_pegasus_x.py @@ -749,7 +749,7 @@ class PegasusXDecoderLayer(GradientCheckpointingLayer): @auto_docstring class PegasusXPreTrainedModel(PreTrainedModel): - config_class = PegasusXConfig + config: PegasusXConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = [r"PegasusXEncoderLayer", r"PegasusXDecoderLayer"] diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py index bfa1fd456f..bd3a2d9e8c 100755 --- a/src/transformers/models/perceiver/modeling_perceiver.py +++ b/src/transformers/models/perceiver/modeling_perceiver.py @@ -560,7 +560,7 @@ class PerceiverEncoder(nn.Module): @auto_docstring class PerceiverPreTrainedModel(PreTrainedModel): - config_class = PerceiverConfig + config: PerceiverConfig base_model_prefix = "perceiver" main_input_name = "inputs" diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index da244141c7..d646569ff0 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -87,7 +87,7 @@ class PerceptionLMMultiModalProjector(nn.Module): @auto_docstring class PerceptionLMPreTrainedModel(PreTrainedModel): - config_class = PerceptionLMConfig + config: PerceptionLMConfig base_model_prefix = "model" supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index ef69edc187..cb3313753e 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -383,7 +383,7 @@ class PersimmonDecoderLayer(GradientCheckpointingLayer): @auto_docstring class PersimmonPreTrainedModel(PreTrainedModel): - config_class = 
PersimmonConfig + config: PersimmonConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["PersimmonDecoderLayer"] diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index d08d73d87a..2ca954635b 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -290,7 +290,7 @@ class PhiRotaryEmbedding(nn.Module): @auto_docstring class PhiPreTrainedModel(PreTrainedModel): - config_class = PhiConfig + config: PhiConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["PhiDecoderLayer"] diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index fb92b54105..35efc8d3db 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -282,7 +282,7 @@ class Phi3DecoderLayer(GradientCheckpointingLayer): @auto_docstring class Phi3PreTrainedModel(PreTrainedModel): - config_class = Phi3Config + config: Phi3Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Phi3DecoderLayer"] diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py index 58008f692b..301bd5a846 100644 --- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py @@ -365,7 +365,7 @@ def default_flax_embed_init(tensor): @auto_docstring class Phi4MultimodalVisionPreTrainedModel(PreTrainedModel): - config_class = Phi4MultimodalVisionConfig + config: Phi4MultimodalVisionConfig base_model_prefix = "phi4_vision" supports_gradient_checkpointing = True @@ -524,7 +524,7 @@ class Phi4MultimodalVisionMultiheadAttentionPoolingHead(nn.Module): class Phi4MultimodalVisionModel(Phi4MultimodalVisionPreTrainedModel): - config_class = 
Phi4MultimodalVisionConfig + config: Phi4MultimodalVisionConfig main_input_name = "pixel_values" def __init__(self, config: Phi4MultimodalVisionConfig): @@ -992,7 +992,7 @@ class Phi4MultimodalAudioMeanVarianceNormLayer(nn.Module): @auto_docstring class Phi4MultimodalAudioPreTrainedModel(PreTrainedModel): - config_class = Phi4MultimodalAudioConfig + config: Phi4MultimodalAudioConfig supports_gradient_checkpointing = True _no_split_modules = ["Phi4MultimodalAudioConformerEncoderLayer"] _supports_flash_attn = True @@ -1584,7 +1584,7 @@ class Phi4MultimodalRotaryEmbedding(nn.Module): @auto_docstring class Phi4MultimodalPreTrainedModel(PreTrainedModel): - config_class = Phi4MultimodalConfig + config: Phi4MultimodalConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Phi4MultimodalDecoderLayer"] diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py index d8c7c03b76..fefe5d69ab 100644 --- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py @@ -535,7 +535,7 @@ class Phi4MultimodalVisionEncoder(SiglipEncoder): class Phi4MultimodalVisionPreTrainedModel(SiglipPreTrainedModel): - config_class = Phi4MultimodalVisionConfig + config: Phi4MultimodalVisionConfig base_model_prefix = "phi4_vision" supports_gradient_checkpointing = True @@ -649,7 +649,7 @@ class Phi4MultimodalVisionMultiheadAttentionPoolingHead(SiglipMultiheadAttention class Phi4MultimodalVisionModel(Phi4MultimodalVisionPreTrainedModel): - config_class = Phi4MultimodalVisionConfig + config: Phi4MultimodalVisionConfig main_input_name = "pixel_values" def __init__(self, config: Phi4MultimodalVisionConfig): @@ -1117,7 +1117,7 @@ class Phi4MultimodalAudioMeanVarianceNormLayer(nn.Module): @auto_docstring class Phi4MultimodalAudioPreTrainedModel(PreTrainedModel): - config_class = 
Phi4MultimodalAudioConfig + config: Phi4MultimodalAudioConfig supports_gradient_checkpointing = True _no_split_modules = ["Phi4MultimodalAudioConformerEncoderLayer"] _supports_flash_attn = True diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index df2978f123..4f8a9f2d28 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -882,7 +882,7 @@ class PhimoeDecoderLayer(GradientCheckpointingLayer): @auto_docstring class PhimoePreTrainedModel(PreTrainedModel): - config_class = PhimoeConfig + config: PhimoeConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["PhimoeDecoderLayer"] diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 900660aa5f..9eee774be2 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -349,7 +349,7 @@ class Pix2StructVisionEncoder(nn.Module): @auto_docstring class Pix2StructPreTrainedModel(PreTrainedModel): - config_class = Pix2StructConfig + config: Pix2StructConfig _supports_static_cache = False @@ -474,7 +474,7 @@ class Pix2StructPreTrainedModel(PreTrainedModel): @auto_docstring class Pix2StructVisionModel(Pix2StructPreTrainedModel): - config_class = Pix2StructVisionConfig + config: Pix2StructVisionConfig main_input_name = "flattened_patches" supports_gradient_checkpointing = True _no_split_modules = ["Pix2StructVisionLayer"] @@ -1013,7 +1013,7 @@ class Pix2StructTextBlock(GradientCheckpointingLayer): """ ) class Pix2StructTextModel(Pix2StructPreTrainedModel): - config_class = Pix2StructTextConfig + config: Pix2StructTextConfig _no_split_modules = ["Pix2StructTextBlock"] _tied_weights_keys = ["lm_head.weight"] supports_gradient_checkpointing = True @@ -1396,7 +1396,7 @@ class 
Pix2StructTextModel(Pix2StructPreTrainedModel): """ ) class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel, GenerationMixin): - config_class = Pix2StructConfig + config: Pix2StructConfig main_input_name = "flattened_patches" _tied_weights_keys = ["decoder.lm_head.weight"] diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 616f5810b2..831e500c57 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -400,7 +400,7 @@ class PixtralTransformer(nn.Module): @auto_docstring class PixtralPreTrainedModel(PreTrainedModel): - config_class = PixtralVisionConfig + config: PixtralVisionConfig base_model_prefix = "model" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index 51c13b8732..52cb126e51 100644 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -73,7 +73,7 @@ class PLBartScaledWordEmbedding(nn.Embedding): @auto_docstring class PLBartPreTrainedModel(PreTrainedModel): - config_class = PLBartConfig + config: PLBartConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["PLBartDecoderLayer", "PLBartEncoderLayer"] diff --git a/src/transformers/models/plbart/modular_plbart.py b/src/transformers/models/plbart/modular_plbart.py index 2aa8568954..3547b1da40 100644 --- a/src/transformers/models/plbart/modular_plbart.py +++ b/src/transformers/models/plbart/modular_plbart.py @@ -58,7 +58,7 @@ class PLBartScaledWordEmbedding(BartScaledWordEmbedding): @auto_docstring class PLBartPreTrainedModel(PreTrainedModel): - config_class = PLBartConfig + config: PLBartConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["PLBartDecoderLayer", 
"PLBartEncoderLayer"] diff --git a/src/transformers/models/poolformer/modeling_poolformer.py b/src/transformers/models/poolformer/modeling_poolformer.py index 8e01e39872..0c72944000 100755 --- a/src/transformers/models/poolformer/modeling_poolformer.py +++ b/src/transformers/models/poolformer/modeling_poolformer.py @@ -246,7 +246,7 @@ class PoolFormerEncoder(nn.Module): @auto_docstring class PoolFormerPreTrainedModel(PreTrainedModel): - config_class = PoolFormerConfig + config: PoolFormerConfig base_model_prefix = "poolformer" main_input_name = "pixel_values" _no_split_modules = ["PoolFormerLayer"] diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py index 6b64c1fd8f..aab17019ed 100644 --- a/src/transformers/models/pop2piano/modeling_pop2piano.py +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -572,7 +572,7 @@ class Pop2PianoBlock(GradientCheckpointingLayer): @auto_docstring class Pop2PianoPreTrainedModel(PreTrainedModel): - config_class = Pop2PianoConfig + config: Pop2PianoConfig base_model_prefix = "transformer" is_parallelizable = False supports_gradient_checkpointing = True diff --git a/src/transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py index 2a463be89f..f8e9092a10 100644 --- a/src/transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py @@ -240,7 +240,7 @@ class PromptDepthAnythingDepthEstimationHead(nn.Module): @auto_docstring class PromptDepthAnythingPreTrainedModel(PreTrainedModel): - config_class = PromptDepthAnythingConfig + config: PromptDepthAnythingConfig base_model_prefix = "prompt_depth_anything" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git 
a/src/transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py index 9563fa9884..fcd3c9c91f 100644 --- a/src/transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py @@ -159,7 +159,7 @@ class PromptDepthAnythingDepthEstimationHead(DepthAnythingDepthEstimationHead): @auto_docstring class PromptDepthAnythingPreTrainedModel(PreTrainedModel): - config_class = PromptDepthAnythingConfig + config: PromptDepthAnythingConfig base_model_prefix = "prompt_depth_anything" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index d9c7807d52..467194eafd 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -333,7 +333,7 @@ class ProphetNetDecoderLMOutput(ModelOutput): @auto_docstring class ProphetNetPreTrainedModel(PreTrainedModel): - config_class = ProphetNetConfig + config: ProphetNetConfig base_model_prefix = "prophetnet" supports_gradient_checkpointing = True diff --git a/src/transformers/models/pvt/modeling_pvt.py b/src/transformers/models/pvt/modeling_pvt.py index 5fb4f8269e..9517b0252e 100755 --- a/src/transformers/models/pvt/modeling_pvt.py +++ b/src/transformers/models/pvt/modeling_pvt.py @@ -442,7 +442,7 @@ class PvtEncoder(nn.Module): @auto_docstring class PvtPreTrainedModel(PreTrainedModel): - config_class = PvtConfig + config: PvtConfig base_model_prefix = "pvt" main_input_name = "pixel_values" _no_split_modules = [] diff --git a/src/transformers/models/pvt_v2/modeling_pvt_v2.py b/src/transformers/models/pvt_v2/modeling_pvt_v2.py index b357cb5970..0e077f41d8 100644 --- a/src/transformers/models/pvt_v2/modeling_pvt_v2.py +++ 
b/src/transformers/models/pvt_v2/modeling_pvt_v2.py @@ -388,7 +388,7 @@ class PvtV2Encoder(nn.Module): @auto_docstring class PvtV2PreTrainedModel(PreTrainedModel): - config_class = PvtV2Config + config: PvtV2Config base_model_prefix = "pvt_v2" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index e58c08c223..eddfef4ced 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -254,7 +254,7 @@ class Qwen2DecoderLayer(GradientCheckpointingLayer): @auto_docstring class Qwen2PreTrainedModel(PreTrainedModel): - config_class = Qwen2Config + config: Qwen2Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Qwen2DecoderLayer"] diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py index d576c801a4..10eeadd766 100644 --- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py @@ -80,7 +80,7 @@ class Qwen2RMSNorm(nn.Module): @auto_docstring class Qwen2_5OmniPreTrainedModel(PreTrainedModel): - config_class = Qwen2_5OmniConfig + config: Qwen2_5OmniConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Qwen2_5OmniDecoderLayer", "Qwen2_5OmniVisionBlock"] @@ -742,7 +742,7 @@ class SinusoidsPositionEmbedding(nn.Module): """ ) class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel): - config_class = Qwen2_5OmniAudioEncoderConfig + config: Qwen2_5OmniAudioEncoderConfig main_input_name = "input_features" _no_split_modules = ["Qwen2_5OmniAudioEncoderLayer"] _supports_sdpa = True @@ -1106,7 +1106,7 @@ class Qwen2_5OmniPatchMerger(nn.Module): class Qwen2_5OmniVisionEncoder(Qwen2_5OmniPreTrainedModel): - config_class = Qwen2_5OmniVisionEncoderConfig + 
config: Qwen2_5OmniVisionEncoderConfig _no_split_modules = ["Qwen2_5OmniVisionBlock"] def __init__(self, config: Qwen2_5OmniVisionEncoderConfig, *inputs, **kwargs) -> None: @@ -1531,7 +1531,7 @@ class Qwen2_5OmniDecoderLayer(GradientCheckpointingLayer): @auto_docstring class Qwen2_5OmniThinkerTextModel(Qwen2_5OmniPreTrainedModel): - config_class = Qwen2_5OmniTextConfig + config: Qwen2_5OmniTextConfig _no_split_modules = ["Qwen2_5OmniDecoderLayer"] def __init__(self, config: Qwen2_5OmniTextConfig): @@ -1683,7 +1683,7 @@ class Qwen2_5OmniThinkerTextModel(Qwen2_5OmniPreTrainedModel): """ ) class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration, GenerationMixin): - config_class = Qwen2_5OmniThinkerConfig + config: Qwen2_5OmniThinkerConfig base_model_prefix = "thinker" _no_split_modules = ["Qwen2_5OmniAudioEncoder", "Qwen2_5OmniVisionEncoder"] @@ -2079,7 +2079,7 @@ class Qwen2_5OmniTalkerCausalLMOutputWithPast(ModelOutput): @auto_docstring class Qwen2_5OmniTalkerModel(Qwen2_5OmniPreTrainedModel): - config_class = Qwen2_5OmniTalkerConfig + config: Qwen2_5OmniTalkerConfig _no_split_modules = ["Qwen2_5OmniTalkerDecoderLayer"] def __init__(self, config: Qwen2_5OmniTalkerConfig): @@ -2225,7 +2225,7 @@ class Qwen2_5OmniTalkerModel(Qwen2_5OmniPreTrainedModel): class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration, GenerationMixin): - config_class = Qwen2_5OmniTalkerConfig + config: Qwen2_5OmniTalkerConfig base_model_prefix = "talker" def __init__(self, config: Qwen2_5OmniTalkerConfig): @@ -3329,7 +3329,7 @@ class AMPBlock(torch.nn.Module): """ ) class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel): - config_class = Qwen2_5OmniBigVGANConfig + config: Qwen2_5OmniBigVGANConfig def __init__(self, config: Qwen2_5OmniBigVGANConfig): super().__init__(config) @@ -3464,7 +3464,7 @@ class RungeKutta4ODESolver: """ ) class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel): - 
config_class = Qwen2_5OmniDiTConfig + config: Qwen2_5OmniDiTConfig _no_split_modules = ["DiTDecoderLayer"] def __init__(self, config: Qwen2_5OmniDiTConfig): @@ -3619,7 +3619,7 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel): """ ) class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel): - config_class = Qwen2_5OmniToken2WavConfig + config: Qwen2_5OmniToken2WavConfig base_model_prefix = "model" _no_split_modules = ["Qwen2_5OmniToken2WavDiTModel", "Qwen2_5OmniToken2WavBigVGANModel"] @@ -3687,7 +3687,7 @@ class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel): """ ) class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, GenerationMixin): - config_class = Qwen2_5OmniConfig + config: Qwen2_5OmniConfig _no_split_modules = [ "Qwen2_5OmniTalkerForConditionalGeneration", "Qwen2_5OmniToken2WavModel", diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 62aadd2804..34e61e0b26 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -1131,7 +1131,7 @@ class Qwen2_5OmniConfig(PretrainedConfig): class Qwen2_5OmniPreTrainedModel(Qwen2_5_VLPreTrainedModel): - config_class = Qwen2_5OmniConfig + config: Qwen2_5OmniConfig _supports_static_cache = False def _init_weights(self, module): @@ -1729,7 +1729,7 @@ class SinusoidsPositionEmbedding(nn.Module): """ ) class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel): - config_class = Qwen2_5OmniAudioEncoderConfig + config: Qwen2_5OmniAudioEncoderConfig main_input_name = "input_features" _no_split_modules = ["Qwen2_5OmniAudioEncoderLayer"] _supports_sdpa = True @@ -2015,7 +2015,7 @@ class Qwen2_5OmniVisionBlock(Qwen2_5_VLVisionBlock): class Qwen2_5OmniVisionEncoder(Qwen2_5_VisionTransformerPretrainedModel): - config_class = Qwen2_5OmniVisionEncoderConfig + config: Qwen2_5OmniVisionEncoderConfig _no_split_modules = 
["Qwen2_5OmniVisionBlock"] def __init__(self, config: Qwen2_5OmniVisionEncoderConfig, *inputs, **kwargs) -> None: @@ -2125,7 +2125,7 @@ class Qwen2MLP(Qwen2_5_VLMLP): class Qwen2_5OmniThinkerTextModel(Qwen2_5_VLTextModel): - config_class = Qwen2_5OmniTextConfig + config: Qwen2_5OmniTextConfig _no_split_modules = ["Qwen2_5OmniDecoderLayer"] def __init__(self, config: Qwen2_5OmniTextConfig): @@ -2138,7 +2138,7 @@ class Qwen2_5OmniThinkerTextModel(Qwen2_5_VLTextModel): """ ) class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration, GenerationMixin): - config_class = Qwen2_5OmniThinkerConfig + config: Qwen2_5OmniThinkerConfig base_model_prefix = "thinker" _no_split_modules = ["Qwen2_5OmniAudioEncoder", "Qwen2_5OmniVisionEncoder"] @@ -2533,7 +2533,7 @@ class Qwen2_5OmniTalkerCausalLMOutputWithPast(ModelOutput): class Qwen2_5OmniTalkerModel(Qwen2_5_VLTextModel): - config_class = Qwen2_5OmniTalkerConfig + config: Qwen2_5OmniTalkerConfig _no_split_modules = ["Qwen2_5OmniTalkerDecoderLayer"] def __init__(self, config: Qwen2_5OmniTalkerConfig): @@ -2542,7 +2542,7 @@ class Qwen2_5OmniTalkerModel(Qwen2_5_VLTextModel): class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration, GenerationMixin): - config_class = Qwen2_5OmniTalkerConfig + config: Qwen2_5OmniTalkerConfig base_model_prefix = "talker" def __init__(self, config: Qwen2_5OmniTalkerConfig): @@ -3646,7 +3646,7 @@ class AMPBlock(torch.nn.Module): """ ) class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel): - config_class = Qwen2_5OmniBigVGANConfig + config: Qwen2_5OmniBigVGANConfig def __init__(self, config: Qwen2_5OmniBigVGANConfig): super().__init__(config) @@ -3781,7 +3781,7 @@ class RungeKutta4ODESolver: """ ) class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel): - config_class = Qwen2_5OmniDiTConfig + config: Qwen2_5OmniDiTConfig _no_split_modules = ["DiTDecoderLayer"] def __init__(self, config: 
Qwen2_5OmniDiTConfig): @@ -3936,7 +3936,7 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel): """ ) class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel): - config_class = Qwen2_5OmniToken2WavConfig + config: Qwen2_5OmniToken2WavConfig base_model_prefix = "model" _no_split_modules = ["Qwen2_5OmniToken2WavDiTModel", "Qwen2_5OmniToken2WavBigVGANModel"] @@ -4004,7 +4004,7 @@ class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel): """ ) class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, GenerationMixin): - config_class = Qwen2_5OmniConfig + config: Qwen2_5OmniConfig _no_split_modules = [ "Qwen2_5OmniTalkerForConditionalGeneration", "Qwen2_5OmniToken2WavModel", diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index cf2d802abb..7cf76017b7 100644 --- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -318,7 +318,7 @@ class Qwen2_5_VLVisionBlock(GradientCheckpointingLayer): @auto_docstring class Qwen2_5_VLPreTrainedModel(PreTrainedModel): - config_class = Qwen2_5_VLConfig + config: Qwen2_5_VLConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"] @@ -344,7 +344,7 @@ class Qwen2_5_VLPreTrainedModel(PreTrainedModel): class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel): - config_class = Qwen2_5_VLVisionConfig + config: Qwen2_5_VLVisionConfig _no_split_modules = ["Qwen2_5_VLVisionBlock"] def __init__(self, config, *inputs, **kwargs) -> None: @@ -803,7 +803,7 @@ class Qwen2_5_VLDecoderLayer(GradientCheckpointingLayer): @auto_docstring class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel): - config_class = Qwen2_5_VLTextConfig + config: Qwen2_5_VLTextConfig def __init__(self, config: Qwen2_5_VLTextConfig): super().__init__(config) @@ -952,7 +952,7 @@ class 
Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel): class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel): base_model_prefix = "" _checkpoint_conversion_mapping = {"^model": "language_model"} - config_class = Qwen2_5_VLConfig + config: Qwen2_5_VLConfig _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"] def __init__(self, config): diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index bd9d516120..000d752165 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -189,7 +189,7 @@ class Qwen2_5_VLPreTrainedModel(Qwen2VLPreTrainedModel): class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel): - config_class = Qwen2_5_VLVisionConfig + config: Qwen2_5_VLVisionConfig _no_split_modules = ["Qwen2_5_VLVisionBlock"] def __init__(self, config, *inputs, **kwargs) -> None: @@ -354,7 +354,7 @@ class Qwen2_5_VLModelOutputWithPast(Qwen2VLModelOutputWithPast): class Qwen2_5_VLModel(Qwen2VLModel): - config_class = Qwen2_5_VLConfig + config: Qwen2_5_VLConfig base_model_prefix = "" _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"] diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py index eafcbaf019..fb9d013eef 100644 --- a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py @@ -257,7 +257,7 @@ class Qwen2AudioEncoderLayer(GradientCheckpointingLayer): @auto_docstring class Qwen2AudioPreTrainedModel(PreTrainedModel): - config_class = Qwen2AudioConfig + config: Qwen2AudioConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Qwen2AudioAttention"] @@ -303,7 +303,7 @@ class Qwen2AudioEncoder(Qwen2AudioPreTrainedModel): """ # Ignore copy - config_class = Qwen2AudioEncoderConfig + config: 
Qwen2AudioEncoderConfig main_input_name = "input_features" _no_split_modules = ["Qwen2AudioEncoderLayer"] diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 9503c92bff..dc70ad547f 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -739,7 +739,7 @@ class Qwen2MoeDecoderLayer(GradientCheckpointingLayer): @auto_docstring class Qwen2MoePreTrainedModel(PreTrainedModel): - config_class = Qwen2MoeConfig + config: Qwen2MoeConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Qwen2MoeDecoderLayer"] diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index d2f9b535c9..6f88158558 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -651,7 +651,7 @@ class Qwen2VLDecoderLayer(GradientCheckpointingLayer): @auto_docstring class Qwen2VLPreTrainedModel(PreTrainedModel): - config_class = Qwen2VLConfig + config: Qwen2VLConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Qwen2VLDecoderLayer", "Qwen2VLVisionBlock"] @@ -681,7 +681,7 @@ class Qwen2VLPreTrainedModel(PreTrainedModel): @auto_docstring class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel): - config_class = Qwen2VLVisionConfig + config: Qwen2VLVisionConfig _no_split_modules = ["Qwen2VLVisionBlock"] def __init__(self, config) -> None: @@ -778,7 +778,7 @@ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel): @auto_docstring class Qwen2VLTextModel(Qwen2VLPreTrainedModel): - config_class = Qwen2VLTextConfig + config: Qwen2VLTextConfig def __init__(self, config: Qwen2VLTextConfig): super().__init__(config) diff --git a/src/transformers/models/qwen3/modeling_qwen3.py 
b/src/transformers/models/qwen3/modeling_qwen3.py index b6e537ec18..4af78a109f 100644 --- a/src/transformers/models/qwen3/modeling_qwen3.py +++ b/src/transformers/models/qwen3/modeling_qwen3.py @@ -280,7 +280,7 @@ class Qwen3DecoderLayer(GradientCheckpointingLayer): @auto_docstring class Qwen3PreTrainedModel(PreTrainedModel): - config_class = Qwen3Config + config: Qwen3Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Qwen3DecoderLayer"] diff --git a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py index f7b180558a..581dd2faba 100644 --- a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -403,7 +403,7 @@ class Qwen3MoeRotaryEmbedding(nn.Module): @auto_docstring class Qwen3MoePreTrainedModel(PreTrainedModel): - config_class = Qwen3MoeConfig + config: Qwen3MoeConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Qwen3MoeDecoderLayer"] diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 89f9f7d1b9..4cb08b1bc4 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -232,7 +232,7 @@ class RetrievAugLMOutput(ModelOutput): ) @auto_docstring class RagPreTrainedModel(PreTrainedModel): - config_class = RagConfig + config: RagConfig base_model_prefix = "rag" _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index b6c2325a69..833b1689c2 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -504,7 +504,7 @@ class RecurrentGemmaDecoderLayer(GradientCheckpointingLayer): @auto_docstring class 
RecurrentGemmaPreTrainedModel(PreTrainedModel): - config_class = RecurrentGemmaConfig + config: RecurrentGemmaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["RecurrentGemmaDecoderLayer"] diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index d0a80755b7..97cb08deb9 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -1900,7 +1900,7 @@ class ReformerOnlyLMHead(nn.Module): @auto_docstring class ReformerPreTrainedModel(PreTrainedModel): - config_class = ReformerConfig + config: ReformerConfig base_model_prefix = "reformer" @property diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py index aeb333e8b0..c9cdda640b 100644 --- a/src/transformers/models/regnet/modeling_regnet.py +++ b/src/transformers/models/regnet/modeling_regnet.py @@ -260,7 +260,7 @@ class RegNetEncoder(nn.Module): @auto_docstring class RegNetPreTrainedModel(PreTrainedModel): - config_class = RegNetConfig + config: RegNetConfig base_model_prefix = "regnet" main_input_name = "pixel_values" _no_split_modules = ["RegNetYLayer"] diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index 93dd4a31fc..c8a81319f3 100755 --- a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -626,7 +626,7 @@ class RemBertOnlyMLMHead(nn.Module): @auto_docstring class RemBertPreTrainedModel(PreTrainedModel): - config_class = RemBertConfig + config: RemBertConfig load_tf_weights = load_tf_weights_in_rembert base_model_prefix = "rembert" supports_gradient_checkpointing = True diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py index c913bb1bdd..266d148bcc 100644 --- 
a/src/transformers/models/resnet/modeling_resnet.py +++ b/src/transformers/models/resnet/modeling_resnet.py @@ -246,7 +246,7 @@ class ResNetEncoder(nn.Module): @auto_docstring class ResNetPreTrainedModel(PreTrainedModel): - config_class = ResNetConfig + config: ResNetConfig base_model_prefix = "resnet" main_input_name = "pixel_values" _no_split_modules = ["ResNetConvLayer", "ResNetShortCut"] diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index 003b2fa519..998685ccc1 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -690,7 +690,7 @@ class RobertaPooler(nn.Module): @auto_docstring class RobertaPreTrainedModel(PreTrainedModel): - config_class = RobertaConfig + config: RobertaConfig base_model_prefix = "roberta" supports_gradient_checkpointing = True _no_split_modules = ["RobertaEmbeddings", "RobertaSelfAttention", "RobertaSdpaSelfAttention"] diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py index e04faa1b63..d778a42703 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -562,7 +562,7 @@ class RobertaPreLayerNormPooler(nn.Module): @auto_docstring class RobertaPreLayerNormPreTrainedModel(PreTrainedModel): - config_class = RobertaPreLayerNormConfig + config: RobertaPreLayerNormConfig base_model_prefix = "roberta_prelayernorm" supports_gradient_checkpointing = True _no_split_modules = ["RobertaPreLayerNormEmbeddings", "RobertaPreLayerNormSelfAttention"] diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 8d98140aff..38111c817c 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ 
b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -734,7 +734,7 @@ class RoCBertOnlyMLMHead(nn.Module): @auto_docstring class RoCBertPreTrainedModel(PreTrainedModel): - config_class = RoCBertConfig + config: RoCBertConfig load_tf_weights = load_tf_weights_in_roc_bert base_model_prefix = "roc_bert" supports_gradient_checkpointing = True diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index 58c2320ecd..0fa8ac4e0c 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -749,7 +749,7 @@ class RoFormerOnlyMLMHead(nn.Module): @auto_docstring class RoFormerPreTrainedModel(PreTrainedModel): - config_class = RoFormerConfig + config: RoFormerConfig load_tf_weights = load_tf_weights_in_roformer base_model_prefix = "roformer" supports_gradient_checkpointing = True diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py index c3fe1c4a99..6ec7edbb2b 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr.py @@ -1000,7 +1000,7 @@ class RTDetrDecoderLayer(nn.Module): @auto_docstring class RTDetrPreTrainedModel(PreTrainedModel): - config_class = RTDetrConfig + config: RTDetrConfig base_model_prefix = "rt_detr" main_input_name = "pixel_values" _no_split_modules = [r"RTDetrHybridEncoder", r"RTDetrDecoderLayer"] diff --git a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py index 17833c7037..770a9b0326 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py @@ -297,7 +297,7 @@ class RTDetrResNetEncoder(nn.Module): @auto_docstring # Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->RTDetrResNet class 
RTDetrResNetPreTrainedModel(PreTrainedModel): - config_class = RTDetrResNetConfig + config: RTDetrResNetConfig base_model_prefix = "resnet" main_input_name = "pixel_values" _no_split_modules = ["RTDetrResNetConvLayer", "RTDetrResNetShortCut"] diff --git a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py index 03a8b09c84..3589b25761 100644 --- a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +++ b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py @@ -1214,7 +1214,7 @@ def _get_clones(partial_module, N): @auto_docstring class RTDetrV2PreTrainedModel(PreTrainedModel): - config_class = RTDetrV2Config + config: RTDetrV2Config base_model_prefix = "rt_detr_v2" main_input_name = "pixel_values" _no_split_modules = [r"RTDetrV2HybridEncoder", r"RTDetrV2DecoderLayer"] diff --git a/src/transformers/models/rwkv/modeling_rwkv.py b/src/transformers/models/rwkv/modeling_rwkv.py index 4614eaff3e..cbc6fc81f0 100644 --- a/src/transformers/models/rwkv/modeling_rwkv.py +++ b/src/transformers/models/rwkv/modeling_rwkv.py @@ -381,7 +381,7 @@ class RwkvBlock(GradientCheckpointingLayer): @auto_docstring class RwkvPreTrainedModel(PreTrainedModel): - config_class = RwkvConfig + config: RwkvConfig base_model_prefix = "rwkv" _no_split_modules = ["RwkvBlock"] _keep_in_fp32_modules = ["time_decay", "time_first"] diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py index c57322a102..42f9609b5e 100644 --- a/src/transformers/models/sam/modeling_sam.py +++ b/src/transformers/models/sam/modeling_sam.py @@ -1026,7 +1026,7 @@ class SamVisionNeck(nn.Module): @auto_docstring class SamPreTrainedModel(PreTrainedModel): - config_class = SamConfig + config: SamConfig base_model_prefix = "sam" main_input_name = "pixel_values" _no_split_modules = ["SamVisionAttention"] @@ -1112,7 +1112,7 @@ class SamVisionEncoder(SamPreTrainedModel): """ ) class 
SamVisionModel(SamPreTrainedModel): - config_class = SamVisionConfig + config: SamVisionConfig main_input_name = "pixel_values" def __init__(self, config: SamVisionConfig): diff --git a/src/transformers/models/sam_hq/modeling_sam_hq.py b/src/transformers/models/sam_hq/modeling_sam_hq.py index cee9d730cf..042e7ab7c0 100644 --- a/src/transformers/models/sam_hq/modeling_sam_hq.py +++ b/src/transformers/models/sam_hq/modeling_sam_hq.py @@ -475,7 +475,7 @@ class SamHQVisionNeck(nn.Module): @auto_docstring class SamHQPreTrainedModel(PreTrainedModel): - config_class = SamHQConfig + config: SamHQConfig base_model_prefix = "sam_hq" main_input_name = "pixel_values" _no_split_modules = ["SamHQVisionAttention"] @@ -1076,7 +1076,7 @@ class SamHQMaskDecoder(nn.Module): """ ) class SamHQVisionModel(SamHQPreTrainedModel): - config_class = SamHQVisionConfig + config: SamHQVisionConfig main_input_name = "pixel_values" def __init__(self, config: SamHQVisionConfig): diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index d461ee6e3d..494e9000ca 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -1338,7 +1338,7 @@ class SeamlessM4TDecoderLayer(GradientCheckpointingLayer): @auto_docstring class SeamlessM4TPreTrainedModel(PreTrainedModel): - config_class = SeamlessM4TConfig + config: SeamlessM4TConfig base_model_prefix = "seamless_m4t" supports_gradient_checkpointing = True _no_split_modules = ["SeamlessM4TEncoderLayer", "SeamlessM4TDecoderLayer", "SeamlessM4TConformerEncoderLayer"] @@ -2302,7 +2302,7 @@ class SeamlessM4THifiGan(nn.Module): """ ) class SeamlessM4TCodeHifiGan(PreTrainedModel): - config_class = SeamlessM4TConfig + config: SeamlessM4TConfig main_input_name = "input_embeds" _no_split_modules = [] diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py 
b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py index 3f4595eeee..b920ced313 100644 --- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -1250,7 +1250,7 @@ class SeamlessM4Tv2TextToUnitDecoderLayer(GradientCheckpointingLayer): @auto_docstring class SeamlessM4Tv2PreTrainedModel(PreTrainedModel): - config_class = SeamlessM4Tv2Config + config: SeamlessM4Tv2Config base_model_prefix = "seamless_m4t_v2" supports_gradient_checkpointing = True _no_split_modules = [ @@ -2505,7 +2505,7 @@ class SeamlessM4Tv2HifiGan(nn.Module): """ ) class SeamlessM4Tv2CodeHifiGan(PreTrainedModel): - config_class = SeamlessM4Tv2Config + config: SeamlessM4Tv2Config main_input_name = "input_embeds" _no_split_modules = [] diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py index b998e0546d..559c8592ba 100755 --- a/src/transformers/models/segformer/modeling_segformer.py +++ b/src/transformers/models/segformer/modeling_segformer.py @@ -435,7 +435,7 @@ class SegformerEncoder(nn.Module): @auto_docstring class SegformerPreTrainedModel(PreTrainedModel): - config_class = SegformerConfig + config: SegformerConfig base_model_prefix = "segformer" main_input_name = "pixel_values" diff --git a/src/transformers/models/seggpt/modeling_seggpt.py b/src/transformers/models/seggpt/modeling_seggpt.py index 364483359e..bbbaf16866 100644 --- a/src/transformers/models/seggpt/modeling_seggpt.py +++ b/src/transformers/models/seggpt/modeling_seggpt.py @@ -598,7 +598,7 @@ class SegGptDecoder(nn.Module): @auto_docstring class SegGptPreTrainedModel(PreTrainedModel): - config_class = SegGptConfig + config: SegGptConfig base_model_prefix = "model" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 
b21a2a12cc..97dd63a548 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -513,7 +513,7 @@ class SEWEncoder(nn.Module): @auto_docstring class SEWPreTrainedModel(PreTrainedModel): - config_class = SEWConfig + config: SEWConfig base_model_prefix = "sew" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/sew/modular_sew.py b/src/transformers/models/sew/modular_sew.py index b093987548..4614892b3d 100644 --- a/src/transformers/models/sew/modular_sew.py +++ b/src/transformers/models/sew/modular_sew.py @@ -259,7 +259,7 @@ class SEWEncoder(nn.Module): @auto_docstring class SEWPreTrainedModel(PreTrainedModel): - config_class = SEWConfig + config: SEWConfig base_model_prefix = "sew" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 678cddde30..53eeda91f3 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1183,7 +1183,7 @@ class SEWDEncoder(nn.Module): @auto_docstring class SEWDPreTrainedModel(PreTrainedModel): - config_class = SEWDConfig + config: SEWDConfig base_model_prefix = "sew-d" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/shieldgemma2/modeling_shieldgemma2.py b/src/transformers/models/shieldgemma2/modeling_shieldgemma2.py index a77ea28a22..e27c01cb59 100644 --- a/src/transformers/models/shieldgemma2/modeling_shieldgemma2.py +++ b/src/transformers/models/shieldgemma2/modeling_shieldgemma2.py @@ -44,7 +44,7 @@ class ShieldGemma2ImageClassifierOutputWithNoAttention(ImageClassifierOutputWith @auto_docstring class ShieldGemma2ForImageClassification(PreTrainedModel): - config_class = ShieldGemma2Config + config: ShieldGemma2Config _checkpoint_conversion_mapping = { 
"model.language_model.model": "model.model.language_model", "model.vision_tower": "model.model.vision_tower", diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py index dfc252c473..ef86988f52 100644 --- a/src/transformers/models/siglip/modeling_siglip.py +++ b/src/transformers/models/siglip/modeling_siglip.py @@ -469,7 +469,7 @@ class SiglipEncoderLayer(GradientCheckpointingLayer): @auto_docstring class SiglipPreTrainedModel(PreTrainedModel): - config_class = SiglipConfig + config: SiglipConfig base_model_prefix = "siglip" supports_gradient_checkpointing = True @@ -681,7 +681,7 @@ class SiglipTextTransformer(nn.Module): """ ) class SiglipTextModel(SiglipPreTrainedModel): - config_class = SiglipTextConfig + config: SiglipTextConfig def __init__(self, config: SiglipTextConfig): super().__init__(config) @@ -809,7 +809,7 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module): """ ) class SiglipVisionModel(SiglipPreTrainedModel): - config_class = SiglipVisionConfig + config: SiglipVisionConfig main_input_name = "pixel_values" def __init__(self, config: SiglipVisionConfig): @@ -863,7 +863,7 @@ class SiglipVisionModel(SiglipPreTrainedModel): @auto_docstring class SiglipModel(SiglipPreTrainedModel): - config_class = SiglipConfig + config: SiglipConfig def __init__(self, config: SiglipConfig): super().__init__(config) diff --git a/src/transformers/models/siglip2/modeling_siglip2.py b/src/transformers/models/siglip2/modeling_siglip2.py index 2ff20c8b23..44249acb82 100644 --- a/src/transformers/models/siglip2/modeling_siglip2.py +++ b/src/transformers/models/siglip2/modeling_siglip2.py @@ -702,7 +702,7 @@ class Siglip2TextTransformer(nn.Module): @auto_docstring class Siglip2PreTrainedModel(PreTrainedModel): - config_class = Siglip2Config + config: Siglip2Config base_model_prefix = "siglip2" supports_gradient_checkpointing = True @@ -770,7 +770,7 @@ class Siglip2PreTrainedModel(PreTrainedModel): """ ) class 
Siglip2TextModel(Siglip2PreTrainedModel): - config_class = Siglip2TextConfig + config: Siglip2TextConfig def __init__(self, config: Siglip2TextConfig): super().__init__(config) @@ -857,7 +857,7 @@ class Siglip2MultiheadAttentionPoolingHead(nn.Module): """ ) class Siglip2VisionModel(Siglip2PreTrainedModel): - config_class = Siglip2VisionConfig + config: Siglip2VisionConfig main_input_name = "pixel_values" def __init__(self, config: Siglip2VisionConfig): @@ -917,7 +917,7 @@ class Siglip2VisionModel(Siglip2PreTrainedModel): @auto_docstring class Siglip2Model(Siglip2PreTrainedModel): - config_class = Siglip2Config + config: Siglip2Config def __init__(self, config: Siglip2Config): super().__init__(config) diff --git a/src/transformers/models/smollm3/modeling_smollm3.py b/src/transformers/models/smollm3/modeling_smollm3.py index afa85c915b..c81a204263 100644 --- a/src/transformers/models/smollm3/modeling_smollm3.py +++ b/src/transformers/models/smollm3/modeling_smollm3.py @@ -284,7 +284,7 @@ class SmolLM3DecoderLayer(GradientCheckpointingLayer): @auto_docstring class SmolLM3PreTrainedModel(PreTrainedModel): - config_class = SmolLM3Config + config: SmolLM3Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["SmolLM3DecoderLayer"] diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py index a0da6da350..dc1a6290d7 100644 --- a/src/transformers/models/smolvlm/modeling_smolvlm.py +++ b/src/transformers/models/smolvlm/modeling_smolvlm.py @@ -49,7 +49,7 @@ logger = logging.get_logger(__name__) @auto_docstring class SmolVLMPreTrainedModel(PreTrainedModel): - config_class = SmolVLMConfig + config: SmolVLMConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["SmolVLMVisionAttention", "SmolVLMDecoderLayer"] @@ -370,7 +370,7 @@ class SmolVLMEncoder(nn.Module): """ ) class SmolVLMVisionTransformer(SmolVLMPreTrainedModel): - config_class = 
SmolVLMVisionConfig + config: SmolVLMVisionConfig _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index f5451a0d1e..0b17ce1548 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -61,7 +61,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin): :meth*~transformers.AutoModelForCausalLM.from_pretrained* class method for the decoder. """ - config_class = SpeechEncoderDecoderConfig + config: SpeechEncoderDecoderConfig base_model_prefix = "speech_encoder_decoder" main_input_name = "inputs" supports_gradient_checkpointing = True diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index ae8a1595b8..63e4cca6f3 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -507,7 +507,7 @@ class Speech2TextDecoderLayer(GradientCheckpointingLayer): @auto_docstring class Speech2TextPreTrainedModel(PreTrainedModel): - config_class = Speech2TextConfig + config: Speech2TextConfig base_model_prefix = "model" main_input_name = "input_features" supports_gradient_checkpointing = True diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index 00655c4060..370eb83fe3 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -1203,7 +1203,7 @@ class SpeechT5DecoderLayer(GradientCheckpointingLayer): @auto_docstring class SpeechT5PreTrainedModel(PreTrainedModel): - config_class = SpeechT5Config + config: 
SpeechT5Config base_model_prefix = "speecht5" main_input_name = "input_values" supports_gradient_checkpointing = True @@ -3123,7 +3123,7 @@ class HifiGanResidualBlock(nn.Module): """ ) class SpeechT5HifiGan(PreTrainedModel): - config_class = SpeechT5HifiGanConfig + config: SpeechT5HifiGanConfig main_input_name = "spectrogram" def __init__(self, config: SpeechT5HifiGanConfig): diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index d11a1eb60d..a5d68a5a52 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -390,7 +390,7 @@ class SplinterEncoder(nn.Module): @auto_docstring class SplinterPreTrainedModel(PreTrainedModel): - config_class = SplinterConfig + config: SplinterConfig base_model_prefix = "splinter" supports_gradient_checkpointing = True diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index 8175baf687..a732e76bda 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -417,7 +417,7 @@ class SqueezeBertOnlyMLMHead(nn.Module): @auto_docstring class SqueezeBertPreTrainedModel(PreTrainedModel): - config_class = SqueezeBertConfig + config: SqueezeBertConfig base_model_prefix = "transformer" def _init_weights(self, module): diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 9e95112028..89fe06748c 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -613,7 +613,7 @@ class StableLmDecoderLayer(GradientCheckpointingLayer): @auto_docstring class StableLmPreTrainedModel(PreTrainedModel): - config_class = StableLmConfig + config: StableLmConfig base_model_prefix = "model" supports_gradient_checkpointing 
= True _no_split_modules = ["StableLmDecoderLayer"] diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 9e95bd88b8..19208dbda1 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -288,7 +288,7 @@ class Starcoder2RotaryEmbedding(nn.Module): @auto_docstring class Starcoder2PreTrainedModel(PreTrainedModel): - config_class = Starcoder2Config + config: Starcoder2Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Starcoder2DecoderLayer"] diff --git a/src/transformers/models/superglue/modeling_superglue.py b/src/transformers/models/superglue/modeling_superglue.py index 56506bc7a2..da0dcfac92 100644 --- a/src/transformers/models/superglue/modeling_superglue.py +++ b/src/transformers/models/superglue/modeling_superglue.py @@ -517,7 +517,7 @@ class SuperGlueFinalProjection(nn.Module): @auto_docstring class SuperGluePreTrainedModel(PreTrainedModel): - config_class = SuperGlueConfig + config: SuperGlueConfig base_model_prefix = "superglue" main_input_name = "pixel_values" diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py index b80e5fa02b..efd3113eb3 100644 --- a/src/transformers/models/superpoint/modeling_superpoint.py +++ b/src/transformers/models/superpoint/modeling_superpoint.py @@ -322,7 +322,7 @@ class SuperPointDescriptorDecoder(nn.Module): @auto_docstring class SuperPointPreTrainedModel(PreTrainedModel): - config_class = SuperPointConfig + config: SuperPointConfig base_model_prefix = "superpoint" main_input_name = "pixel_values" supports_gradient_checkpointing = False diff --git a/src/transformers/models/swiftformer/modeling_swiftformer.py b/src/transformers/models/swiftformer/modeling_swiftformer.py index 5d9707dea8..acc7e8726d 100644 --- 
a/src/transformers/models/swiftformer/modeling_swiftformer.py +++ b/src/transformers/models/swiftformer/modeling_swiftformer.py @@ -388,7 +388,7 @@ class SwiftFormerEncoder(nn.Module): @auto_docstring class SwiftFormerPreTrainedModel(PreTrainedModel): - config_class = SwiftFormerConfig + config: SwiftFormerConfig base_model_prefix = "swiftformer" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 5bd79aec33..18b3fba058 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -848,7 +848,7 @@ class SwinEncoder(nn.Module): @auto_docstring class SwinPreTrainedModel(PreTrainedModel): - config_class = SwinConfig + config: SwinConfig base_model_prefix = "swin" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/swin2sr/modeling_swin2sr.py b/src/transformers/models/swin2sr/modeling_swin2sr.py index de61e5b2d2..be92ac9e70 100644 --- a/src/transformers/models/swin2sr/modeling_swin2sr.py +++ b/src/transformers/models/swin2sr/modeling_swin2sr.py @@ -723,7 +723,7 @@ class Swin2SREncoder(nn.Module): @auto_docstring class Swin2SRPreTrainedModel(PreTrainedModel): - config_class = Swin2SRConfig + config: Swin2SRConfig base_model_prefix = "swin2sr" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index 14ec4791ac..3f0c868b06 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -919,7 +919,7 @@ class Swinv2Encoder(nn.Module): @auto_docstring class Swinv2PreTrainedModel(PreTrainedModel): - config_class = Swinv2Config + config: Swinv2Config base_model_prefix = "swinv2" main_input_name = "pixel_values" supports_gradient_checkpointing = 
True diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index 7ede856b41..2c5a5694e4 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -762,7 +762,7 @@ class SwitchTransformersBlock(GradientCheckpointingLayer): @auto_docstring class SwitchTransformersPreTrainedModel(PreTrainedModel): - config_class = SwitchTransformersConfig + config: SwitchTransformersConfig base_model_prefix = "switch_transformers" supports_gradient_checkpointing = True diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 23a43615b1..472444220c 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -766,7 +766,7 @@ class T5ClassificationHead(nn.Module): @auto_docstring class T5PreTrainedModel(PreTrainedModel): - config_class = T5Config + config: T5Config load_tf_weights = load_tf_weights_in_t5 base_model_prefix = "transformer" is_parallelizable = True diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py index 2cd8798883..cf40994dde 100644 --- a/src/transformers/models/t5gemma/modeling_t5gemma.py +++ b/src/transformers/models/t5gemma/modeling_t5gemma.py @@ -576,7 +576,7 @@ class T5GemmaAttention(nn.Module): @auto_docstring class T5GemmaPreTrainedModel(PreTrainedModel): - config_class = T5GemmaConfig + config: T5GemmaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["T5GemmaBlock"] diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py index 9360008e30..4f416ef6b2 100644 --- a/src/transformers/models/t5gemma/modular_t5gemma.py +++ b/src/transformers/models/t5gemma/modular_t5gemma.py @@ -478,7 
+478,7 @@ class T5GemmaLMHead(nn.Module): @auto_docstring class T5GemmaPreTrainedModel(Gemma2PreTrainedModel): - config_class = T5GemmaConfig + config: T5GemmaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["T5GemmaBlock"] diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 2f0eb777c0..5f0729ba3a 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -680,7 +680,7 @@ class TableTransformerDecoderLayer(GradientCheckpointingLayer): @auto_docstring class TableTransformerPreTrainedModel(PreTrainedModel): - config_class = TableTransformerConfig + config: TableTransformerConfig base_model_prefix = "model" main_input_name = "pixel_values" _no_split_modules = [ diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 8545bc1021..a4c0f68fbc 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -698,7 +698,7 @@ class TapasOnlyMLMHead(nn.Module): @auto_docstring class TapasPreTrainedModel(PreTrainedModel): - config_class = TapasConfig + config: TapasConfig base_model_prefix = "tapas" supports_gradient_checkpointing = True _supports_param_buffer_assignment = False @@ -892,7 +892,7 @@ class TapasModel(TapasPreTrainedModel): @auto_docstring class TapasForMaskedLM(TapasPreTrainedModel): _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] - config_class = TapasConfig + config: TapasConfig base_model_prefix = "tapas" def __init__(self, config): diff --git a/src/transformers/models/textnet/modeling_textnet.py b/src/transformers/models/textnet/modeling_textnet.py index 02c0fe4582..71496763b4 100644 --- a/src/transformers/models/textnet/modeling_textnet.py +++ 
b/src/transformers/models/textnet/modeling_textnet.py @@ -218,7 +218,7 @@ class TextNetEncoder(nn.Module): @auto_docstring class TextNetPreTrainedModel(PreTrainedModel): - config_class = TextNetConfig + config: TextNetConfig base_model_prefix = "textnet" main_input_name = "pixel_values" diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 3e74c55b8f..d03430002f 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -626,7 +626,7 @@ class TimeSeriesTransformerDecoderLayer(GradientCheckpointingLayer): @auto_docstring class TimeSeriesTransformerPreTrainedModel(PreTrainedModel): - config_class = TimeSeriesTransformerConfig + config: TimeSeriesTransformerConfig base_model_prefix = "model" main_input_name = "past_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/timesfm/modeling_timesfm.py b/src/transformers/models/timesfm/modeling_timesfm.py index 5fc07ae1e0..8a6686ac80 100644 --- a/src/transformers/models/timesfm/modeling_timesfm.py +++ b/src/transformers/models/timesfm/modeling_timesfm.py @@ -299,7 +299,7 @@ class TimesFmDecoderLayer(nn.Module): @auto_docstring class TimesFmPreTrainedModel(PreTrainedModel): - config_class = TimesFmConfig + config: TimesFmConfig base_model_prefix = "timesfm" _no_split_modules = ["TimesFmDecoderLayer"] main_input_name = "past_values" diff --git a/src/transformers/models/timesfm/modular_timesfm.py b/src/transformers/models/timesfm/modular_timesfm.py index 5c8f4f0d33..683537637b 100644 --- a/src/transformers/models/timesfm/modular_timesfm.py +++ b/src/transformers/models/timesfm/modular_timesfm.py @@ -255,7 +255,7 @@ class TimesFmDecoderLayer(nn.Module): @auto_docstring class TimesFmPreTrainedModel(PreTrainedModel): - config_class = 
TimesFmConfig + config: TimesFmConfig base_model_prefix = "timesfm" _no_split_modules = ["TimesFmDecoderLayer"] main_input_name = "past_values" diff --git a/src/transformers/models/timesformer/modeling_timesformer.py b/src/transformers/models/timesformer/modeling_timesformer.py index 191a65f9b1..c0110b379a 100644 --- a/src/transformers/models/timesformer/modeling_timesformer.py +++ b/src/transformers/models/timesformer/modeling_timesformer.py @@ -454,7 +454,7 @@ class TimesformerEncoder(nn.Module): @auto_docstring class TimesformerPreTrainedModel(PreTrainedModel): - config_class = TimesformerConfig + config: TimesformerConfig base_model_prefix = "timesformer" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/timm_backbone/modeling_timm_backbone.py b/src/transformers/models/timm_backbone/modeling_timm_backbone.py index 91e0224866..4959f76d58 100644 --- a/src/transformers/models/timm_backbone/modeling_timm_backbone.py +++ b/src/transformers/models/timm_backbone/modeling_timm_backbone.py @@ -40,7 +40,7 @@ class TimmBackbone(PreTrainedModel, BackboneMixin): main_input_name = "pixel_values" supports_gradient_checkpointing = False - config_class = TimmBackboneConfig + config: TimmBackboneConfig def __init__(self, config, **kwargs): requires_backends(self, "timm") diff --git a/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py b/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py index f36eb5382b..34893bfdf9 100644 --- a/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py +++ b/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py @@ -59,7 +59,7 @@ class TimmWrapperModelOutput(ModelOutput): @auto_docstring class TimmWrapperPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" - config_class = TimmWrapperConfig + config: TimmWrapperConfig _no_split_modules = [] model_tags = ["timm"] diff --git a/src/transformers/models/trocr/modeling_trocr.py 
b/src/transformers/models/trocr/modeling_trocr.py index ca10d41d6c..eb7ee35f49 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -417,7 +417,7 @@ class TrOCRDecoderLayer(GradientCheckpointingLayer): @auto_docstring class TrOCRPreTrainedModel(PreTrainedModel): - config_class = TrOCRConfig + config: TrOCRConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["TrOCRDecoderLayer"] diff --git a/src/transformers/models/tvp/modeling_tvp.py b/src/transformers/models/tvp/modeling_tvp.py index 8dfb0851d8..41ef6c3d38 100644 --- a/src/transformers/models/tvp/modeling_tvp.py +++ b/src/transformers/models/tvp/modeling_tvp.py @@ -551,7 +551,7 @@ class TvpPooler(nn.Module): @auto_docstring class TvpPreTrainedModel(PreTrainedModel): - config_class = TvpConfig + config: TvpConfig base_model_prefix = "model" supports_gradient_checkpointing = True diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index a0848a667b..27ed407d4a 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -251,7 +251,7 @@ class UdopPatchEmbeddings(nn.Module): @auto_docstring class UdopPreTrainedModel(PreTrainedModel): - config_class = UdopConfig + config: UdopConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True diff --git a/src/transformers/models/umt5/modeling_umt5.py b/src/transformers/models/umt5/modeling_umt5.py index 62bbd9d7f7..b39c63d43a 100644 --- a/src/transformers/models/umt5/modeling_umt5.py +++ b/src/transformers/models/umt5/modeling_umt5.py @@ -504,7 +504,7 @@ class UMT5ClassificationHead(nn.Module): @auto_docstring class UMT5PreTrainedModel(PreTrainedModel): - config_class = UMT5Config + config: UMT5Config base_model_prefix = "transformer" supports_gradient_checkpointing = True @@ -951,7 +951,7 @@ class UMT5Model(UMT5PreTrainedModel): ```""" 
model_type = "umt5" - config_class = UMT5Config + config: UMT5Config _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config): diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 195923642e..46aaecbbcc 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -781,7 +781,7 @@ class UniSpeechGumbelVectorQuantizer(nn.Module): @auto_docstring class UniSpeechPreTrainedModel(PreTrainedModel): - config_class = UniSpeechConfig + config: UniSpeechConfig base_model_prefix = "unispeech" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/unispeech/modular_unispeech.py b/src/transformers/models/unispeech/modular_unispeech.py index 0f4a98f9be..e05031ffc6 100644 --- a/src/transformers/models/unispeech/modular_unispeech.py +++ b/src/transformers/models/unispeech/modular_unispeech.py @@ -138,7 +138,7 @@ class UniSpeechGumbelVectorQuantizer(Wav2Vec2GumbelVectorQuantizer): @auto_docstring class UniSpeechPreTrainedModel(PreTrainedModel): - config_class = UniSpeechConfig + config: UniSpeechConfig base_model_prefix = "unispeech" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 62aaff87b7..c32a8a35a1 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -786,7 +786,7 @@ class UniSpeechSatGumbelVectorQuantizer(nn.Module): @auto_docstring class UniSpeechSatPreTrainedModel(PreTrainedModel): - config_class = UniSpeechSatConfig + config: UniSpeechSatConfig base_model_prefix = "unispeech_sat" main_input_name = "input_values" supports_gradient_checkpointing = True 
diff --git a/src/transformers/models/unispeech_sat/modular_unispeech_sat.py b/src/transformers/models/unispeech_sat/modular_unispeech_sat.py index 087f83f958..2c5129e7d5 100644 --- a/src/transformers/models/unispeech_sat/modular_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modular_unispeech_sat.py @@ -150,7 +150,7 @@ class UniSpeechSatGumbelVectorQuantizer(Wav2Vec2GumbelVectorQuantizer): @auto_docstring class UniSpeechSatPreTrainedModel(PreTrainedModel): - config_class = UniSpeechSatConfig + config: UniSpeechSatConfig base_model_prefix = "unispeech_sat" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/univnet/modeling_univnet.py b/src/transformers/models/univnet/modeling_univnet.py index 5afbd669ae..1a9c2aa54e 100644 --- a/src/transformers/models/univnet/modeling_univnet.py +++ b/src/transformers/models/univnet/modeling_univnet.py @@ -426,7 +426,7 @@ class UnivNetLvcBlock(nn.Module): @auto_docstring class UnivNetModel(PreTrainedModel): - config_class = UnivNetConfig + config: UnivNetConfig main_input_name = "input_features" def __init__(self, config: UnivNetConfig): diff --git a/src/transformers/models/upernet/modeling_upernet.py b/src/transformers/models/upernet/modeling_upernet.py index a531fa2abe..36dc90c30a 100644 --- a/src/transformers/models/upernet/modeling_upernet.py +++ b/src/transformers/models/upernet/modeling_upernet.py @@ -267,7 +267,7 @@ class UperNetFCNHead(nn.Module): @auto_docstring class UperNetPreTrainedModel(PreTrainedModel): - config_class = UperNetConfig + config: UperNetConfig main_input_name = "pixel_values" _no_split_modules = [] diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index b7f55915f1..215e190502 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -126,7 +126,7 @@ class 
VideoLlavaMultiModalProjector(nn.Module): @auto_docstring class VideoLlavaPreTrainedModel(PreTrainedModel): - config_class = VideoLlavaConfig + config: VideoLlavaConfig base_model_prefix = "" supports_gradient_checkpointing = True _no_split_modules = ["VideoLlavaVisionAttention"] diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index d7c93bdda0..7b22eb9a49 100755 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -462,7 +462,7 @@ class VideoMAEEncoder(nn.Module): @auto_docstring class VideoMAEPreTrainedModel(PreTrainedModel): - config_class = VideoMAEConfig + config: VideoMAEConfig base_model_prefix = "videomae" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 2600605fc6..8fc846addc 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -545,7 +545,7 @@ class ViltEncoder(nn.Module): @auto_docstring class ViltPreTrainedModel(PreTrainedModel): - config_class = ViltConfig + config: ViltConfig base_model_prefix = "vilt" supports_gradient_checkpointing = True _no_split_modules = ["ViltEmbeddings", "ViltSelfAttention"] diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 9a9c986562..14fd0d6376 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -114,7 +114,7 @@ class VipLlavaMultiModalProjector(nn.Module): @auto_docstring class VipLlavaPreTrainedModel(PreTrainedModel): - config_class = VipLlavaConfig + config: VipLlavaConfig base_model_prefix = "" supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git 
a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index 70d3ccedee..96d8228ea5 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -63,7 +63,7 @@ class VisionEncoderDecoderModel(PreTrainedModel, GenerationMixin): :meth*~transformers.AutoModelForCausalLM.from_pretrained* class method for the decoder. """ - config_class = VisionEncoderDecoderConfig + config: VisionEncoderDecoderConfig base_model_prefix = "vision_encoder_decoder" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py index f0d806c311..fb1abc020a 100755 --- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py @@ -44,7 +44,7 @@ def clip_loss(similarity: torch.Tensor) -> torch.Tensor: @auto_docstring class VisionTextDualEncoderModel(PreTrainedModel): - config_class = VisionTextDualEncoderConfig + config: VisionTextDualEncoderConfig base_model_prefix = "vision_text_dual_encoder" _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index 255406c6ce..b8b50d3024 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -497,7 +497,7 @@ class VisualBertPreTrainingHeads(nn.Module): @auto_docstring class VisualBertPreTrainedModel(PreTrainedModel): - config_class = VisualBertConfig + config: VisualBertConfig base_model_prefix = 
"visual_bert" supports_gradient_checkpointing = True diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 58738c0063..298023b8f8 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -445,7 +445,7 @@ class ViTEncoder(nn.Module): @auto_docstring class ViTPreTrainedModel(PreTrainedModel): - config_class = ViTConfig + config: ViTConfig base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py index 31ee70ad34..c0a5fe77a4 100755 --- a/src/transformers/models/vit_mae/modeling_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_vit_mae.py @@ -611,7 +611,7 @@ class ViTMAEEncoder(nn.Module): @auto_docstring class ViTMAEPreTrainedModel(PreTrainedModel): - config_class = ViTMAEConfig + config: ViTMAEConfig base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/vit_msn/modeling_vit_msn.py b/src/transformers/models/vit_msn/modeling_vit_msn.py index c3640dadef..beb4814f09 100644 --- a/src/transformers/models/vit_msn/modeling_vit_msn.py +++ b/src/transformers/models/vit_msn/modeling_vit_msn.py @@ -449,7 +449,7 @@ class ViTMSNEncoder(nn.Module): @auto_docstring class ViTMSNPreTrainedModel(PreTrainedModel): - config_class = ViTMSNConfig + config: ViTMSNConfig base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/vitdet/modeling_vitdet.py b/src/transformers/models/vitdet/modeling_vitdet.py index b74bc1008f..1498b72d85 100644 --- a/src/transformers/models/vitdet/modeling_vitdet.py +++ b/src/transformers/models/vitdet/modeling_vitdet.py @@ -596,7 +596,7 @@ def caffe2_msra_fill(module: nn.Module) -> None: @auto_docstring class 
VitDetPreTrainedModel(PreTrainedModel): - config_class = VitDetConfig + config: VitDetConfig base_model_prefix = "vitdet" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/vitmatte/modeling_vitmatte.py b/src/transformers/models/vitmatte/modeling_vitmatte.py index deea920d24..e91732fe74 100644 --- a/src/transformers/models/vitmatte/modeling_vitmatte.py +++ b/src/transformers/models/vitmatte/modeling_vitmatte.py @@ -52,7 +52,7 @@ class ImageMattingOutput(ModelOutput): @auto_docstring class VitMattePreTrainedModel(PreTrainedModel): - config_class = VitMatteConfig + config: VitMatteConfig main_input_name = "pixel_values" supports_gradient_checkpointing = True _no_split_modules = [] diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 5752621990..d5e6aa2535 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -62,7 +62,7 @@ class VitPoseEstimatorOutput(ModelOutput): @auto_docstring class VitPosePreTrainedModel(PreTrainedModel): - config_class = VitPoseConfig + config: VitPoseConfig base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index 86d85bb53b..47a32c83a2 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -410,7 +410,7 @@ class VitPoseBackboneEncoder(nn.Module): @auto_docstring class VitPoseBackbonePreTrainedModel(PreTrainedModel): - config_class = VitPoseBackboneConfig + config: VitPoseBackboneConfig base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/vits/modeling_vits.py 
b/src/transformers/models/vits/modeling_vits.py index 6d66f7d4f5..e7492960bd 100644 --- a/src/transformers/models/vits/modeling_vits.py +++ b/src/transformers/models/vits/modeling_vits.py @@ -1213,7 +1213,7 @@ class VitsTextEncoder(nn.Module): @auto_docstring class VitsPreTrainedModel(PreTrainedModel): - config_class = VitsConfig + config: VitsConfig base_model_prefix = "vits" main_input_name = "input_ids" supports_gradient_checkpointing = True diff --git a/src/transformers/models/vivit/modeling_vivit.py b/src/transformers/models/vivit/modeling_vivit.py index 54f07e0be0..e37f713025 100755 --- a/src/transformers/models/vivit/modeling_vivit.py +++ b/src/transformers/models/vivit/modeling_vivit.py @@ -453,7 +453,7 @@ class VivitPooler(nn.Module): @auto_docstring class VivitPreTrainedModel(PreTrainedModel): - config_class = VivitConfig + config: VivitConfig base_model_prefix = "vivit" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/vjepa2/modeling_vjepa2.py b/src/transformers/models/vjepa2/modeling_vjepa2.py index 0a7326b916..fb27ffdb6a 100644 --- a/src/transformers/models/vjepa2/modeling_vjepa2.py +++ b/src/transformers/models/vjepa2/modeling_vjepa2.py @@ -981,7 +981,7 @@ class VJEPA2AttentivePooler(nn.Module): @auto_docstring class VJEPA2PreTrainedModel(PreTrainedModel): - config_class = VJEPA2Config + config: VJEPA2Config base_model_prefix = "vjepa2" main_input_name = "pixel_values_videos" supports_gradient_checkpointing = True diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index a5ee3378c1..d92519f890 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1030,7 +1030,7 @@ class Wav2Vec2AttnAdapterLayer(nn.Module): @auto_docstring class Wav2Vec2PreTrainedModel(PreTrainedModel): - config_class = Wav2Vec2Config + config: Wav2Vec2Config base_model_prefix = 
"wav2vec2" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py index 67755e38c8..e416fd8a66 100644 --- a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py @@ -707,7 +707,7 @@ class Wav2Vec2BertAdapterLayer(nn.Module): @auto_docstring class Wav2Vec2BertPreTrainedModel(PreTrainedModel): - config_class = Wav2Vec2BertConfig + config: Wav2Vec2BertConfig base_model_prefix = "wav2vec2_bert" main_input_name = "input_features" supports_gradient_checkpointing = True diff --git a/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py index 6664506a49..d029d22cd3 100644 --- a/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py @@ -579,7 +579,7 @@ class Wav2Vec2BertAdapterLayer(nn.Module): @auto_docstring class Wav2Vec2BertPreTrainedModel(PreTrainedModel): - config_class = Wav2Vec2BertConfig + config: Wav2Vec2BertConfig base_model_prefix = "wav2vec2_bert" main_input_name = "input_features" supports_gradient_checkpointing = True diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index 9cc47da5d1..bdc3dcddaf 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -851,7 +851,7 @@ class Wav2Vec2ConformerAdapterLayer(nn.Module): @auto_docstring class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel): - config_class = Wav2Vec2ConformerConfig + config: Wav2Vec2ConformerConfig base_model_prefix = "wav2vec2_conformer" main_input_name = "input_values" 
supports_gradient_checkpointing = True diff --git a/src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py index c239585254..b54e7d0259 100644 --- a/src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py @@ -546,7 +546,7 @@ class Wav2Vec2ConformerAdapterLayer(Wav2Vec2AdapterLayer): @auto_docstring class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel): - config_class = Wav2Vec2ConformerConfig + config: Wav2Vec2ConformerConfig base_model_prefix = "wav2vec2_conformer" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index 9f571c5dba..b6bf15794f 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -594,7 +594,7 @@ class WavLMGumbelVectorQuantizer(nn.Module): @auto_docstring class WavLMPreTrainedModel(PreTrainedModel): - config_class = WavLMConfig + config: WavLMConfig base_model_prefix = "wavlm" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/wavlm/modular_wavlm.py b/src/transformers/models/wavlm/modular_wavlm.py index 7666ed0561..94b991dffb 100644 --- a/src/transformers/models/wavlm/modular_wavlm.py +++ b/src/transformers/models/wavlm/modular_wavlm.py @@ -504,7 +504,7 @@ class WavLMGumbelVectorQuantizer(nn.Module): class WavLMPreTrainedModel(PreTrainedModel, Wav2Vec2PreTrainedModel): - config_class = WavLMConfig + config: WavLMConfig base_model_prefix = "wavlm" main_input_name = "input_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index a03353c688..1ddd286f3f 100644 --- 
a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -544,7 +544,7 @@ class WhisperDecoderLayer(GradientCheckpointingLayer): @auto_docstring class WhisperPreTrainedModel(PreTrainedModel): - config_class = WhisperConfig + config: WhisperConfig base_model_prefix = "model" main_input_name = "input_features" supports_gradient_checkpointing = True diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index f33082d261..5a2916585d 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -509,7 +509,7 @@ class XCLIPVisionEncoderLayer(GradientCheckpointingLayer): @auto_docstring class XCLIPPreTrainedModel(PreTrainedModel): - config_class = XCLIPConfig + config: XCLIPConfig base_model_prefix = "x_clip" supports_gradient_checkpointing = True @@ -720,7 +720,7 @@ class XCLIPTextTransformer(nn.Module): class XCLIPTextModel(XCLIPPreTrainedModel): - config_class = XCLIPTextConfig + config: XCLIPTextConfig def __init__(self, config: XCLIPTextConfig): super().__init__(config) @@ -913,7 +913,7 @@ class XCLIPVisionTransformer(nn.Module): class XCLIPVisionModel(XCLIPPreTrainedModel): - config_class = XCLIPVisionConfig + config: XCLIPVisionConfig main_input_name = "pixel_values" def __init__(self, config: XCLIPVisionConfig): @@ -1154,7 +1154,7 @@ class XCLIPPromptGenerator(nn.Module): @auto_docstring class XCLIPModel(XCLIPPreTrainedModel): - config_class = XCLIPConfig + config: XCLIPConfig def __init__(self, config: XCLIPConfig): super().__init__(config) diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index 15fc5c2178..d5c8b612ef 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -375,7 +375,7 @@ class XGLMDecoderLayer(GradientCheckpointingLayer): @auto_docstring class 
XGLMPreTrainedModel(PreTrainedModel): - config_class = XGLMConfig + config: XGLMConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["XGLMDecoderLayer"] diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py index b92b3d06d2..57b88811e6 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -621,7 +621,7 @@ class TransformerFFN(nn.Module): @auto_docstring class XLMPreTrainedModel(PreTrainedModel): - config_class = XLMConfig + config: XLMConfig load_tf_weights = None base_model_prefix = "transformer" diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index 43c6680da6..57aa4385bb 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -692,7 +692,7 @@ class XLMRobertaPooler(nn.Module): @auto_docstring # Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->XLMRoberta class XLMRobertaPreTrainedModel(PreTrainedModel): - config_class = XLMRobertaConfig + config: XLMRobertaConfig base_model_prefix = "roberta" supports_gradient_checkpointing = True _no_split_modules = ["XLMRobertaEmbeddings", "XLMRobertaSelfAttention", "XLMRobertaSdpaSelfAttention"] diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index 7e3592847b..10f1df128a 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -686,7 +686,7 @@ class XLMRobertaXLPooler(nn.Module): @auto_docstring class XLMRobertaXLPreTrainedModel(PreTrainedModel): - config_class = XLMRobertaXLConfig + config: XLMRobertaXLConfig base_model_prefix = "roberta" _no_split_modules = 
["XLMRobertaXLEmbeddings", "XLMRobertaXLLayer"] _supports_sdpa = True diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index 315d35bbd7..736521ee95 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ b/src/transformers/models/xlnet/modeling_xlnet.py @@ -796,7 +796,7 @@ class XLNetSequenceSummary(nn.Module): @auto_docstring class XLNetPreTrainedModel(PreTrainedModel): - config_class = XLNetConfig + config: XLNetConfig load_tf_weights = load_tf_weights_in_xlnet base_model_prefix = "transformer" diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index 6266ec88f5..10baf5b6e3 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ b/src/transformers/models/xmod/modeling_xmod.py @@ -628,7 +628,7 @@ class XmodPooler(nn.Module): @auto_docstring class XmodPreTrainedModel(PreTrainedModel): - config_class = XmodConfig + config: XmodConfig base_model_prefix = "roberta" supports_gradient_checkpointing = True diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py index e686f9741b..b201ba72e4 100755 --- a/src/transformers/models/yolos/modeling_yolos.py +++ b/src/transformers/models/yolos/modeling_yolos.py @@ -523,7 +523,7 @@ class YolosEncoder(nn.Module): @auto_docstring class YolosPreTrainedModel(PreTrainedModel): - config_class = YolosConfig + config: YolosConfig base_model_prefix = "vit" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index 1d999ea4ca..442f776a5e 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -639,7 +639,7 @@ class YosoOnlyMLMHead(nn.Module): @auto_docstring class YosoPreTrainedModel(PreTrainedModel): - config_class = YosoConfig + config: YosoConfig base_model_prefix = "yoso" 
supports_gradient_checkpointing = True diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py index 317ae28e44..205f4d1eac 100644 --- a/src/transformers/models/zamba/modeling_zamba.py +++ b/src/transformers/models/zamba/modeling_zamba.py @@ -781,7 +781,7 @@ class ZambaHybridLayer(nn.Module): @auto_docstring class ZambaPreTrainedModel(PreTrainedModel): - config_class = ZambaConfig + config: ZambaConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["ZambaAttentionDecoderLayer", "ZambaMambaDecoderLayer"] diff --git a/src/transformers/models/zamba2/modeling_zamba2.py b/src/transformers/models/zamba2/modeling_zamba2.py index 45e638ba6f..c932e257df 100644 --- a/src/transformers/models/zamba2/modeling_zamba2.py +++ b/src/transformers/models/zamba2/modeling_zamba2.py @@ -1171,7 +1171,7 @@ class Zamba2HybridLayer(nn.Module): class Zamba2PreTrainedModel(PreTrainedModel): - config_class = Zamba2Config + config: Zamba2Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Zamba2AttentionDecoderLayer", "Zamba2MambaDecoderLayer"] diff --git a/src/transformers/models/zamba2/modular_zamba2.py b/src/transformers/models/zamba2/modular_zamba2.py index b912a63419..05565c60d6 100644 --- a/src/transformers/models/zamba2/modular_zamba2.py +++ b/src/transformers/models/zamba2/modular_zamba2.py @@ -894,7 +894,7 @@ class Zamba2HybridLayer(ZambaHybridLayer): class Zamba2PreTrainedModel(PreTrainedModel): - config_class = Zamba2Config + config: Zamba2Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Zamba2AttentionDecoderLayer", "Zamba2MambaDecoderLayer"] diff --git a/src/transformers/models/zoedepth/modeling_zoedepth.py b/src/transformers/models/zoedepth/modeling_zoedepth.py index 9004256338..3d0aaca7b1 100644 --- a/src/transformers/models/zoedepth/modeling_zoedepth.py +++ 
b/src/transformers/models/zoedepth/modeling_zoedepth.py @@ -1206,7 +1206,7 @@ class ZoeDepthMetricDepthEstimationHead(nn.Module): # avoiding sdpa and flash_attn_2 support, it's done int the backend @auto_docstring class ZoeDepthPreTrainedModel(PreTrainedModel): - config_class = ZoeDepthConfig + config: ZoeDepthConfig base_model_prefix = "zoedepth" main_input_name = "pixel_values" supports_gradient_checkpointing = True diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 4da0aefce9..57d97ff214 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -2004,6 +2004,37 @@ class ModelUtilsTest(TestCasePlus): self.assertTrue(explicit_transformers_weights not in os.listdir(tmpdirname)) self.assertTrue("model.safetensors.index.json" in os.listdir(tmpdirname)) + def test_config_class_attribute(self): + # custom configs + class MyConfigA(PretrainedConfig): + pass + + class MyConfigB(PretrainedConfig): + pass + + class MyConfigC(PretrainedConfig): + pass + + # custom models + class MyModelA(PreTrainedModel): + config: dict + config_class = MyConfigA + + class MyModelB(MyModelA): + config: MyConfigB + + class MyModelC(MyModelA): + config_class = MyConfigC + + class MyModelD(MyModelA): + pass + + # child config_class > child 'config:' > parent config_class > parent 'config:' + self.assertIs(MyModelA.config_class, MyConfigA) + self.assertIs(MyModelB.config_class, MyConfigB) + self.assertIs(MyModelC.config_class, MyConfigC) + self.assertIs(MyModelD.config_class, MyConfigA) + @slow @require_torch diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index 731839b062..1f32716c8f 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -945,30 +945,20 @@ def replace_class_node( new_class_docstring = modular_docstring if len(modular_docstring) > 0 else original_modeling_docstring # Compute new class attributes - original_modeling_class_attributes = { - 
node.body[0].targets[0].target.value: node - for node in original_modeling_node.body.body - if m.matches(node, m.SimpleStatementLine(body=[m.Assign()])) - } - original_modeling_class_attributes.update( - { - node.body[0].target.value: node - for node in original_modeling_node.body.body - if m.matches(node, m.SimpleStatementLine(body=[m.AnnAssign()])) - } - ) - modular_class_attributes = { - node.body[0].targets[0].target.value: node - for node in modular_class_node.body.body - if m.matches(node, m.SimpleStatementLine(body=[m.Assign()])) - } - modular_class_attributes.update( - { - node.body[0].target.value: node - for node in modular_class_node.body.body - if m.matches(node, m.SimpleStatementLine(body=[m.AnnAssign()])) - } - ) + original_modeling_class_attributes = {} + for node in original_modeling_node.body.body: + if m.matches(node, m.SimpleStatementLine(body=[m.Assign()])): + original_modeling_class_attributes[node.body[0].targets[0].target.value] = node + elif m.matches(node, m.SimpleStatementLine(body=[m.AnnAssign()])): + original_modeling_class_attributes[node.body[0].target.value] = node + + modular_class_attributes = {} + for node in modular_class_node.body.body: + if m.matches(node, m.SimpleStatementLine(body=[m.Assign()])): + modular_class_attributes[node.body[0].targets[0].target.value] = node + elif m.matches(node, m.SimpleStatementLine(body=[m.AnnAssign()])): + modular_class_attributes[node.body[0].target.value] = node + # Use all original modeling attributes, and potentially override some with values in the modular new_class_attributes = list({**original_modeling_class_attributes, **modular_class_attributes}.values())