From 6b3a1f2f5100a84e40138382a6955627b3b865ba Mon Sep 17 00:00:00 2001 From: BUI Van Tuan <37981884+bvantuan@users.noreply.github.com> Date: Mon, 21 Jul 2025 14:43:52 +0200 Subject: [PATCH] Fix missing initializations for models created in 2023 (#39239) * fix SwiftFormer * fix Kosmos2 * fix Owlv2 * fix Sam * fix Vits * fix Pvt * fix MobileViTV2 * fix PatchTST * fix Bros * fix Informer * fix BridgeTower * fix Mra and Yoso * fix Rwkv * fix EfficientNet * fix NllbMoe * fix Tvp * fix Clap * fix Autoformer * fix SwiftFormer * fix Mgpstr * fix Align * fix VitMatte * fix SpeechT5 * add conditional check for parameters * fix SpeechT5 * fix TimmBackbone and Clvp * fix SwiftFormer * fix SeamlessM4T and SeamlessM4Tv2 * fix Align * fix Owlv2 and OwlViT * add reviewed changes * add reviewed changes * fix typo --------- Co-authored-by: Cyril Vallez --- .../models/align/modeling_align.py | 11 +++-- .../models/autoformer/modeling_autoformer.py | 5 +- .../bridgetower/modeling_bridgetower.py | 36 +++++++------- src/transformers/models/bros/modeling_bros.py | 9 ++-- src/transformers/models/clap/modeling_clap.py | 11 +++-- src/transformers/models/clvp/modeling_clvp.py | 10 ++-- .../efficientnet/modeling_efficientnet.py | 7 +-- .../modeling_fastspeech2_conformer.py | 4 +- .../models/informer/modeling_informer.py | 5 +- .../models/informer/modular_informer.py | 5 +- .../models/kosmos2/modeling_kosmos2.py | 47 ++++--------------- .../models/mgp_str/modeling_mgp_str.py | 15 ++++-- .../models/mobilevit/modeling_mobilevit.py | 4 +- .../mobilevitv2/modeling_mobilevitv2.py | 7 ++- src/transformers/models/mra/modeling_mra.py | 9 ++-- .../models/nllb_moe/modeling_nllb_moe.py | 5 +- .../omdet_turbo/modeling_omdet_turbo.py | 4 +- .../models/owlv2/modeling_owlv2.py | 19 ++++---- .../models/owlvit/modeling_owlvit.py | 16 +++---- .../models/patchtst/modeling_patchtst.py | 12 +++-- src/transformers/models/pvt/modeling_pvt.py | 11 +++-- src/transformers/models/rwkv/modeling_rwkv.py | 38 +++++++++++---- src/transformers/models/sam/modeling_sam.py | 5 +- .../models/sam_hq/modeling_sam_hq.py | 4 +- .../models/sam_hq/modular_sam_hq.py | 3 -- .../seamless_m4t/modeling_seamless_m4t.py | 16 ++++--- .../modeling_seamless_m4t_v2.py | 19 +++++--- .../models/speecht5/modeling_speecht5.py | 20 +++++--- .../swiftformer/modeling_swiftformer.py | 12 ++++- src/transformers/models/tvp/modeling_tvp.py | 21 ++++++--- .../models/vitmatte/modeling_vitmatte.py | 4 +- src/transformers/models/vits/modeling_vits.py | 17 +++++-- src/transformers/models/yoso/modeling_yoso.py | 9 ++-- tests/models/clap/test_modeling_clap.py | 2 +- .../swiftformer/test_modeling_swiftformer.py | 15 +----- .../test_modeling_timm_backbone.py | 4 ++ tests/test_modeling_common.py | 3 +- 37 files changed, 252 insertions(+), 192 deletions(-) diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 4bc5f442cb..455cab2f77 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -885,21 +885,22 @@ class AlignPreTrainedModel(PreTrainedModel): base_model_prefix = "align" supports_gradient_checkpointing = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" + std = self.config.initializer_range if isinstance(module, (nn.Linear, nn.Conv2d)): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, AlignModel): nn.init.xavier_uniform_(module.text_projection.weight) module.text_projection.bias.data.zero_() - module.text_projection._is_hf_initialized = True + module.temperature.data.fill_(self.config.temperature_init_value) elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - if isinstance(module, nn.LayerNorm): + if isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)): module.bias.data.zero_() module.weight.data.fill_(1.0) diff --git a/src/transformers/models/autoformer/modeling_autoformer.py b/src/transformers/models/autoformer/modeling_autoformer.py index cdf7a7db05..a3a305e1e7 100644 --- a/src/transformers/models/autoformer/modeling_autoformer.py +++ b/src/transformers/models/autoformer/modeling_autoformer.py @@ -852,7 +852,7 @@ class AutoformerPreTrainedModel(PreTrainedModel): main_input_name = "past_values" supports_gradient_checkpointing = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): std = self.config.init_std if isinstance(module, (nn.Linear, nn.Conv1d)): module.weight.data.normal_(mean=0.0, std=std) @@ -864,6 +864,9 @@ class AutoformerPreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask def _update_full_mask( diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 3b8313e31e..d82e913299 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -949,30 +949,30 @@ class BridgeTowerPreTrainedModel(PreTrainedModel): _no_split_modules = ["BridgeTowerSelfAttention", "BridgeTowerResidualAttention"] _skip_keys_device_placement = "past_key_values" - def _init_weights(self, module): - if isinstance(module, BridgeTowerVisionModel): - proj_std = (module.visual.transformer.hidden_size**-0.5) * ( - (2 * module.visual.transformer.num_hidden_layers) ** -0.5 - ) - attn_std = module.visual.transformer.hidden_size**-0.5 - fc_std = (2 * module.visual.transformer.hidden_size) ** -0.5 - for block in module.visual.transformer.resblocks: - nn.init.normal_(block.attn.in_proj_weight, std=attn_std * self.config.initializer_factor) - nn.init.normal_(block.attn.out_proj.weight, std=proj_std * self.config.initializer_factor) - nn.init.normal_(block.mlp.c_fc.weight, std=fc_std * self.config.initializer_factor) - nn.init.normal_(block.mlp.c_proj.weight, std=proj_std * self.config.initializer_factor) + def _init_weights(self, module: nn.Module): + std = self.config.initializer_factor + if isinstance(module, BridgeTowerVisionTransformer): + proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) + attn_std = self.config.hidden_size**-0.5 + fc_std = (2 * self.config.hidden_size) ** -0.5 + for block in module.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std * std) + block.attn.in_proj_bias.data.zero_() + nn.init.normal_(block.attn.out_proj.weight, std=proj_std * std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std * std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std * std) - nn.init.normal_(module.visual.embeddings.class_embedding, std=attn_std * self.config.initializer_factor) - nn.init.normal_( - module.visual.embeddings.position_embedding.weight, std=attn_std * self.config.initializer_factor - ) + nn.init.normal_(module.embeddings.class_embedding, std=attn_std * std) + nn.init.normal_(module.embeddings.position_embedding.weight, std=attn_std * std) elif isinstance(module, (nn.Linear, nn.Conv2d, nn.Embedding)): - module.weight.data.normal_(mean=0.0, std=0.05 * self.config.initializer_factor) + module.weight.data.normal_(mean=0.0, std=0.05 * std) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) + elif isinstance(module, BridgeTowerForContrastiveLearning): + module.logit_scale.data.fill_(self.config.logit_scale_init_value) - if isinstance(module, nn.Linear) and module.bias is not None: + if isinstance(module, (nn.Linear, BridgeTowerMLMHead)) and module.bias is not None: module.bias.data.zero_() diff --git a/src/transformers/models/bros/modeling_bros.py b/src/transformers/models/bros/modeling_bros.py index 97a2f7fcf2..68fa2185ff 100755 --- a/src/transformers/models/bros/modeling_bros.py +++ b/src/transformers/models/bros/modeling_bros.py @@ -586,21 +586,24 @@ class BrosPreTrainedModel(PreTrainedModel): config: BrosConfig base_model_prefix = "bros" - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" + std = self.config.initializer_range if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) + elif isinstance(module, BrosRelationExtractor): + nn.init.normal_(module.dummy_node, std=std) @auto_docstring diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 870aab4e2c..87b45900f4 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1404,7 +1404,7 @@ class ClapPreTrainedModel(PreTrainedModel): base_model_prefix = "clap" supports_gradient_checkpointing = False - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" factor = self.config.initializer_factor @@ -1412,12 +1412,11 @@ class ClapPreTrainedModel(PreTrainedModel): module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) module.token_type_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02) elif isinstance(module, ClapModel): - nn.init.normal_(module.logit_scale_a, std=factor * 0.02) - nn.init.normal_(module.logit_scale_t, std=factor * 0.02) + module.logit_scale_a.data.fill_(math.log(self.config.logit_scale_init_value)) + module.logit_scale_t.data.fill_(math.log(self.config.logit_scale_init_value)) elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=factor * 0.02) - - elif isinstance(module, nn.LayerNorm): + elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, (nn.Conv2d, nn.Linear)): @@ -1425,6 +1424,8 @@ class ClapPreTrainedModel(PreTrainedModel): nn.init.normal_(module.weight, std=in_proj_std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, ClapAudioSelfAttention): + module.relative_position_bias_table.data.zero_() class ClapAudioModel(ClapPreTrainedModel): diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py index bf55aa402b..c705997c20 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -788,7 +788,7 @@ class ClvpPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" factor = self.config.initializer_factor if isinstance(module, nn.Embedding): @@ -797,8 +797,9 @@ class ClvpPreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=factor * 0.02) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, ClvpRMSNorm): + module.weight.data.fill_(1.0) elif isinstance(module, ClvpEncoderMLP): - factor = self.config.initializer_factor in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor fc_std = (2 * module.config.hidden_size) ** -0.5 * factor nn.init.normal_(module.fc1.proj.weight if getattr(module.fc1, "proj") else module.fc1.weight, std=fc_std) @@ -816,7 +817,10 @@ class ClvpPreTrainedModel(PreTrainedModel): p.data.normal_( mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.num_hidden_layers)) ) - if isinstance(module, nn.LayerNorm): + elif isinstance(module, ClvpModelForConditionalGeneration): + module.logit_scale.data.fill_(self.config.logit_scale_init_value) + + if isinstance(module, (nn.LayerNorm, nn.GroupNorm)): module.bias.data.zero_() module.weight.data.fill_(1.0) diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py index 814a2375ac..8f53227f69 100644 --- a/src/transformers/models/efficientnet/modeling_efficientnet.py +++ b/src/transformers/models/efficientnet/modeling_efficientnet.py @@ -437,17 +437,14 @@ class EfficientNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" _no_split_modules = [] - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d)): + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) @auto_docstring diff --git a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py index 08039ef2d6..286c11b5e9 100644 --- a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py @@ -1372,9 +1372,9 @@ class FastSpeech2ConformerHifiGan(PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights.""" - if isinstance(module, (nn.Linear, nn.Conv1d)): + if isinstance(module, (nn.Conv1d, nn.ConvTranspose1d)): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py index d914b25531..ed4cda28e4 100644 --- a/src/transformers/models/informer/modeling_informer.py +++ b/src/transformers/models/informer/modeling_informer.py @@ -256,7 +256,7 @@ class InformerPreTrainedModel(PreTrainedModel): main_input_name = "past_values" supports_gradient_checkpointing = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): std = self.config.init_std if isinstance(module, (nn.Linear, nn.Conv1d)): module.weight.data.normal_(mean=0.0, std=std) @@ -268,6 +268,9 @@ class InformerPreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask def _update_full_mask( diff --git a/src/transformers/models/informer/modular_informer.py b/src/transformers/models/informer/modular_informer.py index 4e306b1753..0b3ecb5936 100644 --- a/src/transformers/models/informer/modular_informer.py +++ b/src/transformers/models/informer/modular_informer.py @@ -97,7 +97,7 @@ class InformerPreTrainedModel(PreTrainedModel): main_input_name = "past_values" supports_gradient_checkpointing = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): std = self.config.init_std if isinstance(module, (nn.Linear, nn.Conv1d)): module.weight.data.normal_(mean=0.0, std=std) @@ -109,6 +109,9 @@ class InformerPreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask def _update_full_mask( diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index 2082d68f29..a6cf92bfd0 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -1156,7 +1156,7 @@ class Kosmos2PreTrainedModel(PreTrainedModel): _supports_flash_attn = True _supports_sdpa = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" if isinstance(self, Kosmos2VisionModel): factor = self.config.initializer_factor @@ -1179,65 +1179,34 @@ class Kosmos2PreTrainedModel(PreTrainedModel): nn.init.normal_(module.k_proj.weight, std=in_proj_std) nn.init.normal_(module.v_proj.weight, std=in_proj_std) nn.init.normal_(module.out_proj.weight, std=out_proj_std) - if module.q_proj.bias is not None: - module.q_proj.bias.data.zero_() - if module.k_proj.bias is not None: - module.k_proj.bias.data.zero_() - if module.v_proj.bias is not None: - module.v_proj.bias.data.zero_() - if module.out_proj.bias is not None: - module.out_proj.bias.data.zero_() elif isinstance(module, Kosmos2VisionMLP): in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor fc_std = (2 * module.config.hidden_size) ** -0.5 * factor nn.init.normal_(module.fc1.weight, std=fc_std) nn.init.normal_(module.fc2.weight, std=in_proj_std) - if module.fc1.bias is not None: - module.fc1.bias.data.zero_() - if module.fc2.bias is not None: - module.fc2.bias.data.zero_() - elif isinstance(module, Kosmos2VisionEncoderLayer): - module.layer_norm1.bias.data.zero_() - module.layer_norm1.weight.data.fill_(1.0) - module.layer_norm2.bias.data.zero_() - module.layer_norm2.weight.data.fill_(1.0) - elif isinstance(module, Kosmos2VisionTransformer): - module.pre_layrnorm.bias.data.zero_() - module.pre_layrnorm.weight.data.fill_(1.0) - module.post_layernorm.bias.data.zero_() - module.post_layernorm.weight.data.fill_(1.0) elif isinstance(module, KosmosTextAttention): nn.init.normal_(module.q_proj.weight, std=std) nn.init.normal_(module.k_proj.weight, std=std) nn.init.normal_(module.v_proj.weight, std=std) nn.init.normal_(module.out_proj.weight, std=std) - if module.q_proj.bias is not None: - module.q_proj.bias.data.zero_() - if module.k_proj.bias is not None: - module.k_proj.bias.data.zero_() - if module.v_proj.bias is not None: - module.v_proj.bias.data.zero_() - if module.out_proj.bias is not None: - module.out_proj.bias.data.zero_() elif isinstance(module, Kosmos2TextFFN): nn.init.normal_(module.fc1.weight, std=std) nn.init.normal_(module.fc2.weight, std=std) - if module.fc1.bias is not None: - module.fc1.bias.data.zero_() - if module.fc2.bias is not None: - module.fc2.bias.data.zero_() elif isinstance(module, Kosmos2TextForCausalLM): nn.init.normal_(module.lm_head.weight, std=std) - if module.lm_head.bias is not None: - module.lm_head.bias.data.zero_() elif isinstance(module, Kosmos2ImageToTextProjection): nn.init.normal_(module.dense.weight, std=std) - if module.dense.bias is not None: - module.dense.bias.data.zero_() + nn.init.normal_(module.latent_query) elif isinstance(module, Kosmos2TextTransformer): module.embed_tokens.weight.data.normal_(mean=0.0, std=std) if module.embed_tokens.padding_idx is not None: module.embed_tokens.weight.data[module.embed_tokens.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() class Kosmos2VisionModel(Kosmos2PreTrainedModel): diff --git a/src/transformers/models/mgp_str/modeling_mgp_str.py b/src/transformers/models/mgp_str/modeling_mgp_str.py index 27b9d0df9d..9e6ab26a4b 100644 --- a/src/transformers/models/mgp_str/modeling_mgp_str.py +++ b/src/transformers/models/mgp_str/modeling_mgp_str.py @@ -290,13 +290,14 @@ class MgpstrPreTrainedModel(PreTrainedModel): base_model_prefix = "mgp_str" _no_split_modules = [] - def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + def _init_weights(self, module: nn.Module) -> None: """Initialize the weights""" + std = self.config.initializer_range if isinstance(module, MgpstrEmbeddings): - nn.init.trunc_normal_(module.pos_embed, mean=0.0, std=self.config.initializer_range) - nn.init.trunc_normal_(module.cls_token, mean=0.0, std=self.config.initializer_range) + nn.init.trunc_normal_(module.pos_embed, mean=0.0, std=std) + nn.init.trunc_normal_(module.cls_token, mean=0.0, std=std) elif isinstance(module, (nn.Linear, nn.Conv2d)): - module.weight.data = nn.init.trunc_normal_(module.weight.data, mean=0.0, std=self.config.initializer_range) + nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): @@ -312,6 +313,9 @@ class MgpstrModel(MgpstrPreTrainedModel): self.embeddings = MgpstrEmbeddings(config) self.encoder = MgpstrEncoder(config) + # Initialize weights and apply final processing + self.post_init() + def get_input_embeddings(self) -> nn.Module: return self.embeddings.proj @@ -374,6 +378,9 @@ class MgpstrForSceneTextRecognition(MgpstrPreTrainedModel): self.bpe_head = nn.Linear(config.hidden_size, config.num_bpe_labels) self.wp_head = nn.Linear(config.hidden_size, config.num_wordpiece_labels) + # Initialize weights and apply final processing + self.post_init() + @auto_docstring def forward( self, diff --git a/src/transformers/models/mobilevit/modeling_mobilevit.py b/src/transformers/models/mobilevit/modeling_mobilevit.py index ce6c67e1ad..adfa133c51 100755 --- a/src/transformers/models/mobilevit/modeling_mobilevit.py +++ b/src/transformers/models/mobilevit/modeling_mobilevit.py @@ -629,9 +629,9 @@ class MobileViTPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["MobileViTLayer"] - def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + def _init_weights(self, module: nn.Module) -> None: """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d)): + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) diff --git a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py index f37f37c605..450c871ca9 100644 --- a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py +++ b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py @@ -569,7 +569,6 @@ class MobileViTV2Encoder(nn.Module): @auto_docstring -# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTPreTrainedModel with MobileViT->MobileViTV2,mobilevit->mobilevitv2 class MobileViTV2PreTrainedModel(PreTrainedModel): config: MobileViTV2Config base_model_prefix = "mobilevitv2" @@ -577,15 +576,15 @@ class MobileViTV2PreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["MobileViTV2Layer"] - def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + def _init_weights(self, module: nn.Module) -> None: """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d)): + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, nn.LayerNorm): + elif isinstance(module, nn.GroupNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) diff --git a/src/transformers/models/mra/modeling_mra.py b/src/transformers/models/mra/modeling_mra.py index 602cf53cc0..3a37712e85 100644 --- a/src/transformers/models/mra/modeling_mra.py +++ b/src/transformers/models/mra/modeling_mra.py @@ -821,21 +821,24 @@ class MraPreTrainedModel(PreTrainedModel): base_model_prefix = "mra" supports_gradient_checkpointing = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" + std = self.config.initializer_range if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) + elif isinstance(module, MraLMPredictionHead): + module.bias.data.zero_() @auto_docstring diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index 5537d8c128..c8c927ed58 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -850,7 +850,7 @@ class NllbMoePreTrainedModel(PreTrainedModel): _supports_sdpa = False _supports_flex_attn = False - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" std = self.config.init_std if isinstance(module, nn.Linear): @@ -861,6 +861,9 @@ class NllbMoePreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() class NllbMoeEncoder(NllbMoePreTrainedModel): diff --git a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py index 8256f070df..966be71d70 100644 --- a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py +++ b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py @@ -1012,11 +1012,11 @@ class OmDetTurboPreTrainedModel(PreTrainedModel): nn.init.xavier_uniform_(layer[0].weight) elif isinstance(module, OmDetTurboLanguageBackbone): nn.init.normal_(module.text_projection, std=self.config.text_projection_in_dim**-0.5) - elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + elif isinstance(module, (nn.Linear, nn.Conv2d)): module.weight.data.normal_(mean=0.0, std=self.config.init_std) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, nn.LayerNorm): + elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)): module.weight.data.fill_(1.0) module.bias.data.zero_() diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py index 6294e58d69..e80292d98c 100644 --- a/src/transformers/models/owlv2/modeling_owlv2.py +++ b/src/transformers/models/owlv2/modeling_owlv2.py @@ -560,19 +560,17 @@ class Owlv2PreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["Owlv2EncoderLayer"] - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" factor = self.config.initializer_factor if isinstance(module, Owlv2TextEmbeddings): module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) elif isinstance(module, Owlv2VisionEmbeddings): - factor = self.config.initializer_factor nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) elif isinstance(module, Owlv2Attention): - factor = self.config.initializer_factor in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor out_proj_std = (module.embed_dim**-0.5) * factor nn.init.normal_(module.q_proj.weight, std=in_proj_std) @@ -580,7 +578,6 @@ class Owlv2PreTrainedModel(PreTrainedModel): nn.init.normal_(module.v_proj.weight, std=in_proj_std) nn.init.normal_(module.out_proj.weight, std=out_proj_std) elif isinstance(module, Owlv2MLP): - factor = self.config.initializer_factor in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor fc_std = (2 * module.config.hidden_size) ** -0.5 * factor nn.init.normal_(module.fc1.weight, std=fc_std) @@ -588,17 +585,20 @@ class Owlv2PreTrainedModel(PreTrainedModel): elif isinstance(module, Owlv2Model): nn.init.normal_( module.text_projection.weight, - std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + std=module.text_embed_dim**-0.5 * factor, ) nn.init.normal_( module.visual_projection.weight, - std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + std=module.vision_embed_dim**-0.5 * factor, ) + module.logit_scale.data.fill_(self.config.logit_scale_init_value) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=factor) + if module.bias is not None: + module.bias.data.zero_() # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTEncoder with OwlViT->Owlv2 @@ -1225,6 +1225,9 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel): self.num_patches_width = self.config.vision_config.image_size // self.config.vision_config.patch_size self.box_bias = self.compute_box_bias(self.num_patches_height, self.num_patches_width) + # Initialize weights and apply final processing + self.post_init() + @staticmethod # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.normalize_grid_corner_coordinates def normalize_grid_corner_coordinates(num_patches_height: int, num_patches_width: int) -> torch.Tensor: diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index da914d48b1..482be48ec1 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -547,19 +547,17 @@ class OwlViTPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["OwlViTEncoderLayer"] - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" factor = self.config.initializer_factor if isinstance(module, OwlViTTextEmbeddings): module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) elif isinstance(module, OwlViTVisionEmbeddings): - factor = self.config.initializer_factor nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) elif isinstance(module, OwlViTAttention): - factor = self.config.initializer_factor in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor out_proj_std = (module.embed_dim**-0.5) * factor nn.init.normal_(module.q_proj.weight, std=in_proj_std) @@ -567,7 +565,6 @@ class OwlViTPreTrainedModel(PreTrainedModel): nn.init.normal_(module.v_proj.weight, std=in_proj_std) nn.init.normal_(module.out_proj.weight, std=out_proj_std) elif isinstance(module, OwlViTMLP): - factor = self.config.initializer_factor in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor fc_std = (2 * module.config.hidden_size) ** -0.5 * factor nn.init.normal_(module.fc1.weight, std=fc_std) @@ -575,17 +572,20 @@ class OwlViTPreTrainedModel(PreTrainedModel): elif isinstance(module, OwlViTModel): nn.init.normal_( module.text_projection.weight, - std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + std=module.text_embed_dim**-0.5 * factor, ) nn.init.normal_( module.visual_projection.weight, - std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + std=module.vision_embed_dim**-0.5 * factor, ) + module.logit_scale.data.fill_(self.config.logit_scale_init_value) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=factor) + if module.bias is not None: + module.bias.data.zero_() class OwlViTEncoder(nn.Module): diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 559cf7df61..3f4b0c95e4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -558,24 +558,28 @@ class PatchTSTPreTrainedModel(PreTrainedModel): main_input_name = "past_values" supports_gradient_checkpointing = False - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """ Initialize weights """ if isinstance(module, PatchTSTPositionalEncoding): + # get the number of patches + num_patches = ( + max(self.config.context_length, self.config.patch_length) - self.config.patch_length + ) // self.config.patch_stride + 1 # initialize cls_token if self.config.use_cls_token: nn.init.normal_(module.cls_token, std=0.02) + num_patches += 1 # initialize positional encoding - if self.config.positional_encoding_type == "random": - nn.init.normal_(module.position_enc, mean=0.0, std=0.1) + module.position_enc = module._init_pe(self.config, num_patches) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, PatchTSTBatchNorm): module.batchnorm.bias.data.zero_() module.batchnorm.weight.data.fill_(1.0) - elif isinstance(module, (nn.Linear, nn.Conv1d)): + elif isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=self.config.init_std) if module.bias is not None: module.bias.data.zero_() diff --git a/src/transformers/models/pvt/modeling_pvt.py b/src/transformers/models/pvt/modeling_pvt.py index 9517b0252e..9e2c5a69d8 100755 --- a/src/transformers/models/pvt/modeling_pvt.py +++ b/src/transformers/models/pvt/modeling_pvt.py @@ -447,12 +447,13 @@ class PvtPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" _no_split_modules = [] - def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + def _init_weights(self, module: nn.Module) -> None: """Initialize the weights""" - if isinstance(module, nn.Linear): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv2d)): # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid # `trunc_normal_cpu` not implemented in `half` issues - module.weight.data = nn.init.trunc_normal_(module.weight.data, mean=0.0, std=self.config.initializer_range) + nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): @@ -462,13 +463,13 @@ class PvtPreTrainedModel(PreTrainedModel): module.position_embeddings.data = nn.init.trunc_normal_( module.position_embeddings.data, mean=0.0, - std=self.config.initializer_range, + std=std, ) if module.cls_token is not None: module.cls_token.data = nn.init.trunc_normal_( module.cls_token.data, mean=0.0, - std=self.config.initializer_range, + std=std, ) diff --git a/src/transformers/models/rwkv/modeling_rwkv.py b/src/transformers/models/rwkv/modeling_rwkv.py index cbc6fc81f0..0b16af2789 100644 --- a/src/transformers/models/rwkv/modeling_rwkv.py +++ b/src/transformers/models/rwkv/modeling_rwkv.py @@ -388,7 +388,7 @@ class RwkvPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _is_stateful = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights.""" if isinstance(module, RwkvSelfAttention): layer_id = module.layer_id @@ -420,13 +420,12 @@ class RwkvPreTrainedModel(PreTrainedModel): * 0.5 ) - with torch.no_grad(): - module.time_decay.data = decay_speed - module.time_first.data = torch.ones_like(module.time_first * math.log(0.3) + zigzag) + module.time_decay.data = decay_speed + module.time_first.data = torch.ones_like(module.time_first * math.log(0.3) + zigzag) - module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0) - module.time_mix_value.data = torch.pow(time_weight, ratio_1_to_almost0) + 0.3 * ratio_0_to_1 - module.time_mix_receptance.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0) + module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0) + module.time_mix_value.data = torch.pow(time_weight, ratio_1_to_almost0) + 0.3 * ratio_0_to_1 + module.time_mix_receptance.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0) elif isinstance(module, RwkvFeedForward): layer_id = module.layer_id num_hidden_layers = module.config.num_hidden_layers @@ -441,9 +440,28 @@ class RwkvPreTrainedModel(PreTrainedModel): ) time_weight = time_weight[None, None, :] - with torch.no_grad(): - module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0) - module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0) + module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0) + module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0) + elif isinstance(module, nn.Linear): + shape = module.weight.data.shape + gain = 1.0 + scale = 1.0 # extra scale for gain + if module.bias is not None: + module.bias.data.zero_() + if shape[0] > shape[1]: + gain = math.sqrt(shape[0] / shape[1]) + if shape[0] == self.config.vocab_size and shape[1] == self.config.hidden_size: # final projection? + scale = 0.5 + + gain *= scale + nn.init.orthogonal_(module.weight, gain=gain) + elif isinstance(module, nn.Embedding): + shape = module.weight.data.shape + gain = 1e-4 * math.sqrt(max(shape[0], shape[1])) + nn.init.orthogonal_(module.weight, gain=gain) + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() @dataclass diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py index 8a607f237c..474e30c584 100644 --- a/src/transformers/models/sam/modeling_sam.py +++ b/src/transformers/models/sam/modeling_sam.py @@ -1016,7 +1016,7 @@ class SamPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _supports_sdpa = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): std = self.config.initializer_range if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): module.weight.data.normal_(mean=0.0, std=std) @@ -1033,6 +1033,9 @@ class SamPreTrainedModel(PreTrainedModel): if module.use_rel_pos: module.rel_pos_h.data.zero_() module.rel_pos_w.data.zero_() + elif isinstance(module, SamVisionEncoder): + if self.config.use_abs_pos: + module.pos_embed.data.zero_() class SamVisionEncoder(SamPreTrainedModel): diff --git a/src/transformers/models/sam_hq/modeling_sam_hq.py b/src/transformers/models/sam_hq/modeling_sam_hq.py index 288d4134d2..fac2b15500 100644 --- a/src/transformers/models/sam_hq/modeling_sam_hq.py +++ b/src/transformers/models/sam_hq/modeling_sam_hq.py @@ -499,8 +499,8 @@ class SamHQPreTrainedModel(PreTrainedModel): if module.use_rel_pos: module.rel_pos_h.data.zero_() module.rel_pos_w.data.zero_() - if isinstance(module, SamHQVisionEncoder): - if module.pos_embed is not None: + elif isinstance(module, SamHQVisionEncoder): + if self.config.use_abs_pos: module.pos_embed.data.zero_() diff --git a/src/transformers/models/sam_hq/modular_sam_hq.py b/src/transformers/models/sam_hq/modular_sam_hq.py index 67399295c6..2a241fb2c0 100644 --- a/src/transformers/models/sam_hq/modular_sam_hq.py +++ b/src/transformers/models/sam_hq/modular_sam_hq.py @@ -188,9 +188,6 @@ class SamHQVisionLayer(SamVisionLayer): class SamHQPreTrainedModel(SamPreTrainedModel): def _init_weights(self, module): super()._init_weights(module) - if isinstance(module, SamHQVisionEncoder): - if module.pos_embed is not None: - module.pos_embed.data.zero_() class SamHQVisionEncoder(SamVisionEncoder, SamHQPreTrainedModel): diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index 494e9000ca..6c1b827d8f 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -1343,7 +1343,7 @@ class SeamlessM4TPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["SeamlessM4TEncoderLayer", "SeamlessM4TDecoderLayer", "SeamlessM4TConformerEncoderLayer"] - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" std = self.config.initializer_range if isinstance(module, nn.Linear): @@ -1370,7 +1370,7 @@ class SeamlessM4TPreTrainedModel(PreTrainedModel): k = math.sqrt(1 / module.projection.in_features) nn.init.uniform_(module.projection.weight, a=-k, b=k) nn.init.uniform_(module.projection.bias, a=-k, b=k) - elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, nn.Conv1d): @@ -2426,16 +2426,20 @@ class SeamlessM4TCodeHifiGan(PreTrainedModel): return hidden_states, lengths - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights.""" + std = self.config.initializer_range if isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() def apply_weight_norm(self): weight_norm = nn.utils.weight_norm @@ -2730,7 +2734,7 @@ class SeamlessM4TForTextToText(SeamlessM4TPreTrainedModel, GenerationMixin): """ ) class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel, GenerationMixin): - _keys_to_ignore_on_load_missing = ["text_decoder", "t2u_model", "vocoder"] + _keys_to_ignore_on_load_missing = ["text_encoder", "t2u_model", "vocoder"] main_input_name = "input_features" _tied_weights_keys = [ diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py index b920ced313..38b736d23e 100644 --- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -1260,7 +1260,7 @@ class SeamlessM4Tv2PreTrainedModel(PreTrainedModel): "SeamlessM4Tv2TextToUnitDecoderLayer", ] - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" std = self.config.initializer_range if isinstance(module, nn.Linear): @@ -1280,7 +1280,10 @@ class SeamlessM4Tv2PreTrainedModel(PreTrainedModel): k = math.sqrt(1 / module.projection.in_features) nn.init.uniform_(module.projection.weight, a=-k, b=k) nn.init.uniform_(module.projection.bias, a=-k, b=k) - elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + elif isinstance(module, SeamlessM4Tv2TextToUnitDecoder): + module.pos_emb_alpha_char.data.fill_(1) + module.pos_emb_alpha.data.fill_(1) + elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, (nn.Conv1d, nn.ConvTranspose1d)): @@ -2636,16 +2639,20 @@ class SeamlessM4Tv2CodeHifiGan(PreTrainedModel): return hidden_states, lengths # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._init_weights - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights.""" + std = self.config.initializer_range if isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.apply_weight_norm def apply_weight_norm(self): @@ -2943,7 +2950,7 @@ class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin): """ ) class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin): - _keys_to_ignore_on_load_missing = ["text_decoder", "t2u_model", "vocoder"] + _keys_to_ignore_on_load_missing = ["text_encoder", "t2u_model", "vocoder"] main_input_name = "input_features" _tied_weights_keys = [ diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index 370eb83fe3..4d6e5da03e 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -414,7 +414,7 @@ class SpeechT5ScaledPositionalEncoding(nn.Module): self.register_buffer("pe", pe, persistent=False) self.dropout = nn.Dropout(p=dropout) self.dim = dim - self.alpha = torch.nn.Parameter(torch.tensor(1.0)) + self.alpha = nn.Parameter(torch.tensor(1.0)) def forward(self, emb): emb = emb + self.alpha * self.pe[:, : emb.size(1)] @@ -1208,8 +1208,9 @@ class SpeechT5PreTrainedModel(PreTrainedModel): main_input_name = "input_values" supports_gradient_checkpointing = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" + std = self.config.initializer_range if isinstance(module, SpeechT5PositionalConvEmbedding): nn.init.normal_( module.conv.weight, @@ -1217,15 +1218,17 @@ class SpeechT5PreTrainedModel(PreTrainedModel): std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), ) nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, SpeechT5ScaledPositionalEncoding): + module.alpha.data.fill_(1.0) elif isinstance(module, SpeechT5FeatureProjection): k = math.sqrt(1 / module.projection.in_features) nn.init.uniform_(module.projection.weight, a=-k, b=k) nn.init.uniform_(module.projection.bias, a=-k, b=k) elif isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm1d)): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, nn.Conv1d): @@ -1234,10 +1237,13 @@ class SpeechT5PreTrainedModel(PreTrainedModel): k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) nn.init.uniform_(module.bias, a=-k, b=k) elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + if hasattr(module, "masked_spec_embed"): + nn.init.uniform_(module.masked_spec_embed) + class SpeechT5Encoder(SpeechT5PreTrainedModel): """ @@ -3164,9 +3170,9 @@ class SpeechT5HifiGan(PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights.""" - if isinstance(module, (nn.Linear, nn.Conv1d)): + if isinstance(module, (nn.Conv1d, nn.ConvTranspose1d)): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() diff --git a/src/transformers/models/swiftformer/modeling_swiftformer.py b/src/transformers/models/swiftformer/modeling_swiftformer.py index acc7e8726d..4b6fac00a0 100644 --- a/src/transformers/models/swiftformer/modeling_swiftformer.py +++ b/src/transformers/models/swiftformer/modeling_swiftformer.py @@ -394,15 +394,23 @@ class SwiftFormerPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["SwiftFormerEncoderBlock"] - def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + def _init_weights(self, module: nn.Module) -> None: """Initialize the weights""" if isinstance(module, (nn.Conv2d, nn.Linear)): nn.init.trunc_normal_(module.weight, std=0.02) if module.bias is not None: nn.init.constant_(module.bias, 0) - elif isinstance(module, (nn.LayerNorm)): + elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)): nn.init.constant_(module.bias, 0) nn.init.constant_(module.weight, 1.0) + elif isinstance(module, (SwiftFormerConvEncoder, SwiftFormerLocalRepresentation)): + module.layer_scale.data.fill_(1.0) + elif isinstance(module, SwiftFormerEncoderBlock): + if self.config.use_layer_scale: + module.layer_scale_1.data.fill_(self.config.layer_scale_init_value) + module.layer_scale_2.data.fill_(self.config.layer_scale_init_value) + elif isinstance(module, SwiftFormerEfficientAdditiveAttention): + nn.init.normal_(module.w_g) @auto_docstring diff --git a/src/transformers/models/tvp/modeling_tvp.py b/src/transformers/models/tvp/modeling_tvp.py index 41ef6c3d38..77d74bffe0 100644 --- a/src/transformers/models/tvp/modeling_tvp.py +++ b/src/transformers/models/tvp/modeling_tvp.py @@ -555,7 +555,7 @@ class TvpPreTrainedModel(PreTrainedModel): base_model_prefix = "model" supports_gradient_checkpointing = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization @@ -564,14 +564,23 @@ class TvpPreTrainedModel(PreTrainedModel): elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - if isinstance(module, nn.Conv2d): + elif isinstance(module, nn.Conv2d): nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") if module.bias is not None: nn.init.constant_(module.bias, 0) + elif isinstance(module, TvpModel): + nn.init.normal_(module.text_prompt) + + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + if hasattr(module, "pad_up"): + nn.init.normal_(module.pad_up) + if hasattr(module, "pad_down"): + nn.init.normal_(module.pad_down) + if hasattr(module, "pad_left"): + nn.init.normal_(module.pad_left) + if hasattr(module, "pad_right"): + nn.init.normal_(module.pad_right) class TvpFrameDownPadPrompter(nn.Module): diff --git a/src/transformers/models/vitmatte/modeling_vitmatte.py b/src/transformers/models/vitmatte/modeling_vitmatte.py index e91732fe74..50cec9c153 100644 --- a/src/transformers/models/vitmatte/modeling_vitmatte.py +++ b/src/transformers/models/vitmatte/modeling_vitmatte.py @@ -57,8 +57,8 @@ class VitMattePreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = [] - def _init_weights(self, module): - if isinstance(module, nn.Conv2d): + def _init_weights(self, module: nn.Module): + if isinstance(module, (nn.Conv2d, nn.BatchNorm2d)): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() diff --git a/src/transformers/models/vits/modeling_vits.py b/src/transformers/models/vits/modeling_vits.py index e7492960bd..6accac596a 100644 --- a/src/transformers/models/vits/modeling_vits.py +++ b/src/transformers/models/vits/modeling_vits.py @@ -1218,24 +1218,33 @@ class VitsPreTrainedModel(PreTrainedModel): main_input_name = "input_ids" supports_gradient_checkpointing = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" + std = self.config.initializer_range if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - elif isinstance(module, nn.Conv1d): + elif isinstance(module, (nn.Conv1d, nn.ConvTranspose1d)): nn.init.kaiming_normal_(module.weight) if module.bias is not None: k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) nn.init.uniform_(module.bias, a=-k, b=k) elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + elif isinstance(module, VitsAttention): + if self.config.window_size: + head_dim = self.config.hidden_size // self.config.num_attention_heads + nn.init.normal_(module.emb_rel_k, std=head_dim**-0.5) + nn.init.normal_(module.emb_rel_v, std=head_dim**-0.5) + elif isinstance(module, VitsElementwiseAffine): + module.translate.data.zero_() + module.log_scale.data.zero_() @auto_docstring( diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index 442f776a5e..221ebaa637 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -643,21 +643,24 @@ class YosoPreTrainedModel(PreTrainedModel): base_model_prefix = "yoso" supports_gradient_checkpointing = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): """Initialize the weights""" + std = self.config.initializer_range if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) + elif isinstance(module, YosoLMPredictionHead): + module.bias.data.zero_() @auto_docstring diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index e828a54827..24a28e80bd 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -536,7 +536,7 @@ class ClapModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): for name, param in model.named_parameters(): if param.requires_grad: # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": + if "logit_scale" in name: self.assertAlmostEqual( param.data.item(), np.log(1 / 0.07), diff --git a/tests/models/swiftformer/test_modeling_swiftformer.py b/tests/models/swiftformer/test_modeling_swiftformer.py index 7302c47127..5b5736f83d 100644 --- a/tests/models/swiftformer/test_modeling_swiftformer.py +++ b/tests/models/swiftformer/test_modeling_swiftformer.py @@ -13,10 +13,9 @@ # limitations under the License. """Testing suite for the PyTorch SwiftFormer model.""" -import copy import unittest -from transformers import PretrainedConfig, SwiftFormerConfig +from transformers import SwiftFormerConfig from transformers.testing_utils import ( require_torch, require_vision, @@ -26,7 +25,7 @@ from transformers.testing_utils import ( from transformers.utils import cached_property, is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -234,16 +233,6 @@ class SwiftFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC check_hidden_states_output(inputs_dict, config, model_class) def test_initialization(self): - def _config_zero_init(config): - configs_no_init = copy.deepcopy(config) - for key in configs_no_init.__dict__.keys(): - if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: - setattr(configs_no_init, key, 1e-10) - if isinstance(getattr(configs_no_init, key, None), PretrainedConfig): - no_init_subconfig = _config_zero_init(getattr(configs_no_init, key)) - setattr(configs_no_init, key, no_init_subconfig) - return configs_no_init - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() configs_no_init = _config_zero_init(config) diff --git a/tests/models/timm_backbone/test_modeling_timm_backbone.py b/tests/models/timm_backbone/test_modeling_timm_backbone.py index e915233a38..306b9d2b06 100644 --- a/tests/models/timm_backbone/test_modeling_timm_backbone.py +++ b/tests/models/timm_backbone/test_modeling_timm_backbone.py @@ -136,6 +136,10 @@ class TimmBackboneModelTest(ModelTesterMixin, BackboneTesterMixin, PipelineTeste def test_hidden_states_output(self): pass + @unittest.skip(reason="TimmBackbone initialization is managed on the timm side") + def test_can_init_all_missing_weights(self): + pass + @unittest.skip(reason="TimmBackbone initialization is managed on the timm side") def test_initialization(self): pass diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 62641ad356..5a24bcecee 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -854,9 +854,8 @@ class ModelTesterMixin: for model_class in self.all_model_classes: # For now, skip everything older than 2024 and "important models" (too much models to patch otherwise) - # Use `supports_cache_class` as a proxy to judge "important" models in order to prioritize them # TODO: relax this as we patch more and more models - if addition_year < 2024: + if addition_year < 2023: self.skipTest(reason=f"{model_class} is not a priorited model for now.") # Monkey patch the method to add a seed (we do it on PreTrainedModel._initialize_weights, which wraps