From e355c0a11c927d9e8f22409559c0fae76ccc598c Mon Sep 17 00:00:00 2001 From: BUI Van Tuan <37981884+bvantuan@users.noreply.github.com> Date: Wed, 2 Jul 2025 15:03:57 +0200 Subject: [PATCH] Fix missing initializations for models created in 2024 (#38987) * fix GroundingDino * fix SuperGlue * fix GroundingDino * fix MambaModel * fix OmDetTurbo * fix SegGpt * fix Qwen2Audio * fix Mamba2 * fix DabDetr * fix Dac * fix FalconMamba * skip timm initialization * fix Encodec and MusicgenMelody * fix Musicgen * skip timm initialization test * fix OmDetTurbo * clean the code Co-authored-by: Cyril Vallez * add reviewed changes * add back timm * style * better check for parametrizations --------- Co-authored-by: Cyril Vallez --- .../models/dab_detr/modeling_dab_detr.py | 5 ++ src/transformers/models/dac/modeling_dac.py | 6 +++ .../models/encodec/modeling_encodec.py | 22 +++----- .../falcon_mamba/modeling_falcon_mamba.py | 53 ++++++++++++------- .../grounding_dino/modeling_grounding_dino.py | 10 ++-- .../models/mamba/modeling_mamba.py | 53 ++++++++++++------- .../models/mamba2/modeling_mamba2.py | 52 +++++++++++------- .../models/musicgen/modeling_musicgen.py | 5 +- .../modeling_musicgen_melody.py | 7 ++- .../omdet_turbo/modeling_omdet_turbo.py | 5 ++ .../qwen2_audio/modeling_qwen2_audio.py | 3 ++ .../models/seggpt/modeling_seggpt.py | 4 +- .../models/superglue/modeling_superglue.py | 9 ++-- tests/models/encodec/test_modeling_encodec.py | 7 +-- .../test_modeling_falcon_mamba.py | 19 ++++++- .../test_modeling_grounding_dino.py | 2 + tests/models/mamba/test_modeling_mamba.py | 19 ++++++- tests/models/mamba2/test_modeling_mamba2.py | 30 +++++++++-- .../omdet_turbo/test_modeling_omdet_turbo.py | 1 + .../test_modeling_timm_wrapper.py | 8 +++ tests/test_modeling_common.py | 7 ++- 21 files changed, 229 insertions(+), 98 deletions(-) diff --git a/src/transformers/models/dab_detr/modeling_dab_detr.py b/src/transformers/models/dab_detr/modeling_dab_detr.py index 0f88a06fe6..119a7a0b16 100644 --- a/src/transformers/models/dab_detr/modeling_dab_detr.py +++ b/src/transformers/models/dab_detr/modeling_dab_detr.py @@ -829,6 +829,9 @@ class DabDetrPreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: @@ -841,6 +844,8 @@ class DabDetrPreTrainedModel(PreTrainedModel): prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1) bias_value = -math.log((1 - prior_prob) / prior_prob) module.class_embed.bias.data.fill_(bias_value) + elif isinstance(module, nn.PReLU): + module.reset_parameters() # Modified from transformers.models.detr.modeling_detr.DetrEncoder with Detr->DabDetr,DETR->ConditionalDETR diff --git a/src/transformers/models/dac/modeling_dac.py b/src/transformers/models/dac/modeling_dac.py index 191e7af89e..398d258bef 100644 --- a/src/transformers/models/dac/modeling_dac.py +++ b/src/transformers/models/dac/modeling_dac.py @@ -480,6 +480,12 @@ class DacPreTrainedModel(PreTrainedAudioTokenizerBase): if isinstance(module, nn.Conv1d): nn.init.trunc_normal_(module.weight, std=0.02) nn.init.constant_(module.bias, 0) + elif isinstance(module, Snake1d): + module.alpha.data.fill_(1.0) + elif isinstance(module, nn.ConvTranspose1d): + module.reset_parameters() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=0.02) def apply_weight_norm(self): weight_norm = nn.utils.weight_norm diff --git a/src/transformers/models/encodec/modeling_encodec.py b/src/transformers/models/encodec/modeling_encodec.py index a74315ab4c..6e610ba295 100644 --- a/src/transformers/models/encodec/modeling_encodec.py +++ b/src/transformers/models/encodec/modeling_encodec.py @@ -235,7 +235,7 @@ class EncodecLSTM(nn.Module): LSTM without worrying about the hidden state, nor the layout of the data. Expects input as convolutional layout. """ - def __init__(self, config, dimension): + def __init__(self, config: EncodecConfig, dimension: int): super().__init__() self.lstm = nn.LSTM(dimension, dimension, config.num_lstm_layers) @@ -452,11 +452,7 @@ class EncodecPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights""" - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + if isinstance(module, nn.GroupNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, nn.Conv1d): @@ -464,10 +460,8 @@ class EncodecPreTrainedModel(PreTrainedModel): if module.bias is not None: k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) nn.init.uniform_(module.bias, a=-k, b=k) - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.ConvTranspose1d): + module.reset_parameters() elif isinstance(module, nn.LSTM): for name, param in module.named_parameters(): if "weight" in name: @@ -659,7 +653,7 @@ class EncodecModel(EncodecPreTrainedModel): def decode( self, - audio_codes: torch.Tensor, + audio_codes: torch.LongTensor, audio_scales: torch.Tensor, padding_mask: Optional[torch.Tensor] = None, return_dict: Optional[bool] = None, @@ -708,10 +702,10 @@ class EncodecModel(EncodecPreTrainedModel): @auto_docstring def forward( self, - input_values: torch.Tensor, - padding_mask: Optional[torch.Tensor] = None, + input_values: torch.FloatTensor, + padding_mask: Optional[torch.BoolTensor] = None, bandwidth: Optional[float] = None, - audio_codes: Optional[torch.Tensor] = None, + audio_codes: Optional[torch.LongTensor] = None, audio_scales: Optional[torch.Tensor] = None, return_dict: Optional[bool] = None, ) -> Union[tuple[torch.Tensor, torch.Tensor], EncodecOutput]: diff --git a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 426e557d9d..942053be3e 100644 --- a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -445,9 +445,16 @@ class FalconMambaPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights.""" + std = self.config.initializer_range if isinstance(module, FalconMambaMixer): + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :] + A = A.expand(module.intermediate_size, -1).contiguous() + module.A_log.copy_(torch.log(A)) module.A_log._no_weight_decay = True module.D._no_weight_decay = True + module.D.data.fill_(1.0) dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale if self.config.time_step_init_scheme == "constant": @@ -462,33 +469,39 @@ class FalconMambaPreTrainedModel(PreTrainedModel): ).clamp(min=self.config.time_step_floor) # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 inv_dt = dt + torch.log(-torch.expm1(-dt)) - with torch.no_grad(): - module.dt_proj.bias.copy_(inv_dt) + module.dt_proj.bias.copy_(inv_dt) module.dt_proj.bias._no_reinit = True + nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5)) + if module.conv1d.bias is not None: + if not getattr(module.conv1d.bias, "_no_reinit", False): + nn.init.zeros_(module.conv1d.bias) + nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5)) + + if self.config.rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + p = module.out_proj.weight + p /= math.sqrt(self.config.num_hidden_layers) + if isinstance(module, nn.Linear): + if not getattr(module.weight, "_no_reinit", False): + nn.init.normal_(module.weight, std=std) if module.bias is not None: if not getattr(module.bias, "_no_reinit", False): nn.init.zeros_(module.bias) + elif isinstance(module, FalconMambaRMSNorm): + module.weight.data.fill_(1.0) elif isinstance(module, nn.Embedding): - nn.init.normal_(module.weight, std=self.config.initializer_range) - - if self.config.rescale_prenorm_residual: - # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: - # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale - # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. - # > -- GPT-2 :: https://openai.com/blog/better-language-models/ - # - # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py - for name, p in module.named_parameters(): - if name in ["out_proj.weight"]: - # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block - # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) - # We need to reinit p since this code could be called multiple times - # Having just p *= scale would repeatedly scale it down - nn.init.kaiming_uniform_(p, a=math.sqrt(5)) - with torch.no_grad(): - p /= math.sqrt(self.config.num_hidden_layers) + nn.init.normal_(module.weight, std=std) @dataclass diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 31ccb4becd..743f74a121 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1414,16 +1414,18 @@ class GroundingDinoPreTrainedModel(PreTrainedModel): module.out_vision_proj.bias.data.fill_(0) nn.init.xavier_uniform_(module.out_text_proj.weight) module.out_text_proj.bias.data.fill_(0) - elif isinstance(module, (GroundingDinoEncoderLayer, GroundingDinoDecoderLayer)): - for p in module.parameters(): - if p.dim() > 1: - nn.init.normal_(p, mean=0.0, std=std) + elif isinstance(module, GroundingDinoFusionLayer): + module.vision_param.data.fill_(1e-4) + module.text_param.data.fill_(1e-4) elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.weight.data.fill_(1.0) + module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index f2347833db..7da4ef5787 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -382,9 +382,16 @@ class MambaPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights.""" + std = self.config.initializer_range if isinstance(module, MambaMixer): + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :] + A = A.expand(module.intermediate_size, -1).contiguous() + module.A_log.copy_(torch.log(A)) module.A_log._no_weight_decay = True module.D._no_weight_decay = True + module.D.data.fill_(1.0) dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale if self.config.time_step_init_scheme == "constant": @@ -399,33 +406,39 @@ class MambaPreTrainedModel(PreTrainedModel): ).clamp(min=self.config.time_step_floor) # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 inv_dt = dt + torch.log(-torch.expm1(-dt)) - with torch.no_grad(): - module.dt_proj.bias.copy_(inv_dt) + module.dt_proj.bias.copy_(inv_dt) module.dt_proj.bias._no_reinit = True + nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5)) + if module.conv1d.bias is not None: + if not getattr(module.conv1d.bias, "_no_reinit", False): + nn.init.zeros_(module.conv1d.bias) + nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5)) + + if self.config.rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + p = module.out_proj.weight + p /= math.sqrt(self.config.num_hidden_layers) + if isinstance(module, nn.Linear): + if not getattr(module.weight, "_no_reinit", False): + nn.init.normal_(module.weight, std=std) if module.bias is not None: if not getattr(module.bias, "_no_reinit", False): nn.init.zeros_(module.bias) + elif isinstance(module, MambaRMSNorm): + module.weight.data.fill_(1.0) elif isinstance(module, nn.Embedding): - nn.init.normal_(module.weight, std=self.config.initializer_range) - - if self.config.rescale_prenorm_residual: - # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: - # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale - # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. - # > -- GPT-2 :: https://openai.com/blog/better-language-models/ - # - # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py - for name, p in module.named_parameters(): - if name in ["out_proj.weight"]: - # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block - # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) - # We need to reinit p since this code could be called multiple times - # Having just p *= scale would repeatedly scale it down - nn.init.kaiming_uniform_(p, a=math.sqrt(5)) - with torch.no_grad(): - p /= math.sqrt(self.config.num_hidden_layers) + nn.init.normal_(module.weight, std=std) @dataclass diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py index 1f663462d5..e601b4d8a6 100644 --- a/src/transformers/models/mamba2/modeling_mamba2.py +++ b/src/transformers/models/mamba2/modeling_mamba2.py @@ -721,9 +721,15 @@ class Mamba2PreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights.""" + std = self.config.initializer_range if isinstance(module, Mamba2Mixer): + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = torch.arange(1, self.config.num_heads + 1) + module.A_log.copy_(torch.log(A)) module.A_log._no_weight_decay = True module.D._no_weight_decay = True + module.D.data.fill_(1.0) dt = torch.exp( torch.rand(self.config.num_heads) @@ -733,33 +739,39 @@ class Mamba2PreTrainedModel(PreTrainedModel): # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 inv_dt = dt + torch.log(-torch.expm1(-dt)) - with torch.no_grad(): - module.dt_bias.copy_(inv_dt) + module.dt_bias.copy_(inv_dt) module.dt_bias._no_reinit = True + nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5)) + if module.conv1d.bias is not None: + if not getattr(module.conv1d.bias, "_no_reinit", False): + nn.init.zeros_(module.conv1d.bias) + nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5)) + + if self.config.rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + p = module.out_proj.weight + p /= math.sqrt(self.config.num_hidden_layers) + if isinstance(module, nn.Linear): + if not getattr(module.weight, "_no_reinit", False): + nn.init.normal_(module.weight, std=std) if module.bias is not None: if not getattr(module.bias, "_no_reinit", False): nn.init.zeros_(module.bias) + elif isinstance(module, (Mamba2RMSNorm, MambaRMSNormGated)): + module.weight.data.fill_(1.0) elif isinstance(module, nn.Embedding): - nn.init.normal_(module.weight, std=self.config.initializer_range) - - if self.config.rescale_prenorm_residual: - # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: - # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale - # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. - # > -- GPT-2 :: https://openai.com/blog/better-language-models/ - # - # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py - for name, p in module.named_parameters(): - if name in ["out_proj.weight"]: - # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block - # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) - # We need to reinit p since this code could be called multiple times - # Having just p *= scale would repeatedly scale it down - nn.init.kaiming_uniform_(p, a=math.sqrt(5)) - with torch.no_grad(): - p /= math.sqrt(self.config.num_hidden_layers) + nn.init.normal_(module.weight, std=std) @dataclass diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 8a0cb7dbf8..139256c7c7 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -440,10 +440,13 @@ class MusicgenPreTrainedModel(PreTrainedModel): def _init_weights(self, module): std = self.config.initializer_factor - if isinstance(module, (nn.Linear, nn.Conv1d)): + if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index e8aa032784..55e28ca58f 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -406,10 +406,13 @@ class MusicgenMelodyPreTrainedModel(PreTrainedModel): def _init_weights(self, module): std = self.config.initializer_factor - if isinstance(module, (nn.Linear, nn.Conv1d)): + if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: @@ -1286,7 +1289,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin): The text encoder model that encodes text into hidden states for conditioning. audio_encoder (`PreTrainedModel`, *optional*): The audio encoder model that encodes audio into hidden states for conditioning. - decoder (`MusicgenForCausalLM`, *optional*): + decoder (`MusicgenMelodyForCausalLM`, *optional*): The decoder model that generates audio tokens based on conditioning signals. """ if config is None and None in (text_encoder, audio_encoder, decoder): diff --git a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py index 0dfbf83332..9bac40553d 100644 --- a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py +++ b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py @@ -1006,10 +1006,15 @@ class OmDetTurboPreTrainedModel(PreTrainedModel): nn.init.xavier_uniform_(module.query_position_head.layers[1].weight) for layer in module.channel_projection_layers: nn.init.xavier_uniform_(layer[0].weight) + elif isinstance(module, OmDetTurboLanguageBackbone): + nn.init.normal_(module.text_projection, std=self.config.text_projection_in_dim**-0.5) elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): module.weight.data.normal_(mean=0.0, std=self.config.init_std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, OmDetTurboDecoder): diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py index 45fcbe8049..f90f7ff9cf 100644 --- a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py @@ -283,6 +283,9 @@ class Qwen2AudioPreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: diff --git a/src/transformers/models/seggpt/modeling_seggpt.py b/src/transformers/models/seggpt/modeling_seggpt.py index 80a51fb556..364483359e 100644 --- a/src/transformers/models/seggpt/modeling_seggpt.py +++ b/src/transformers/models/seggpt/modeling_seggpt.py @@ -604,7 +604,7 @@ class SegGptPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["SegGptEmbeddings", "SegGptLayer"] - def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + def _init_weights(self, module: nn.Module) -> None: """Initialize the weights""" std = self.config.initializer_range if isinstance(module, (nn.Linear, nn.Conv2d)): @@ -615,7 +615,7 @@ class SegGptPreTrainedModel(PreTrainedModel): ) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, nn.LayerNorm): + elif isinstance(module, (nn.LayerNorm, SegGptLayerNorm)): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, SegGptAttention): diff --git a/src/transformers/models/superglue/modeling_superglue.py b/src/transformers/models/superglue/modeling_superglue.py index 33e50de7aa..ce92e7b66b 100644 --- a/src/transformers/models/superglue/modeling_superglue.py +++ b/src/transformers/models/superglue/modeling_superglue.py @@ -551,17 +551,18 @@ class SuperGluePreTrainedModel(PreTrainedModel): def _init_weights(self, module: nn.Module) -> None: """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d, nn.Conv1d)): + if isinstance(module, (nn.Linear, nn.Conv2d)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, nn.LayerNorm): + elif isinstance(module, nn.BatchNorm1d): module.bias.data.zero_() module.weight.data.fill_(1.0) - elif isinstance(module, SuperGlueMultiLayerPerceptron): - nn.init.constant_(module.linear.bias, 0.0) + + if hasattr(module, "bin_score"): + module.bin_score.data.fill_(1.0) @auto_docstring( diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index 21e9ac1040..a429561b71 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -310,12 +310,13 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def test_feed_forward_chunking(self): (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + # original_config.norm_type = "time_group_norm" for model_class in self.all_model_classes: torch.manual_seed(0) config = copy.deepcopy(original_config) config.chunk_length_s = None config.overlap = None - config.sampling_rate = 10 + config.sampling_rate = 20 model = model_class(config) model.to(torch_device) @@ -326,9 +327,9 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) hidden_states_no_chunk = model(**inputs)[1] torch.manual_seed(0) - config.chunk_length_s = 1 + config.chunk_length_s = 2 config.overlap = 0 - config.sampling_rate = 10 + config.sampling_rate = 20 model = model_class(config) model.to(torch_device) diff --git a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py index e59787fb8c..cada419ea0 100644 --- a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py +++ b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py @@ -33,7 +33,7 @@ from transformers.testing_utils import ( from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -359,9 +359,11 @@ class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest def test_initialization(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() + config.rescale_prenorm_residual = True + configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: - model = model_class(config=config) + model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if "dt_proj.bias" in name: dt = torch.exp( @@ -380,6 +382,19 @@ class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest if param.requires_grad: # check if it's a ones like torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) + else: + if param.requires_grad: + if ( + "mixer.conv1d.weight" in name + or "mixer.dt_proj.weight" in name + or "mixer.out_proj.weight" in name + ): + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) @slow # Ignore copy diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 2afe3f0ef3..d632f99e2c 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -586,6 +586,8 @@ class GroundingDinoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes or "value_proj" in name or "output_proj" in name or "reference_points" in name + or "vision_proj" in name + or "text_proj" in name ): continue self.assertIn( diff --git a/tests/models/mamba/test_modeling_mamba.py b/tests/models/mamba/test_modeling_mamba.py index 840493648f..b570d1a130 100644 --- a/tests/models/mamba/test_modeling_mamba.py +++ b/tests/models/mamba/test_modeling_mamba.py @@ -24,7 +24,7 @@ from transformers.testing_utils import require_torch, require_torch_multi_gpu, s from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -326,9 +326,11 @@ class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi def test_initialization(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() + config.rescale_prenorm_residual = True + configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: - model = model_class(config=config) + model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if "dt_proj.bias" in name: dt = torch.exp( @@ -347,6 +349,19 @@ class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi if param.requires_grad: # check if it's a ones like torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) + else: + if param.requires_grad: + if ( + "mixer.conv1d.weight" in name + or "mixer.dt_proj.weight" in name + or "mixer.out_proj.weight" in name + ): + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) @slow def test_model_from_pretrained(self): diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index dfa8bca69e..c9cec231e6 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -13,6 +13,7 @@ # limitations under the License. +import math import unittest from transformers import AutoTokenizer, Mamba2Config, is_torch_available @@ -28,7 +29,7 @@ from transformers.utils.import_utils import is_causal_conv1d_available, is_mamba from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -276,14 +277,37 @@ class Mamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix def test_initialization(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() + config.rescale_prenorm_residual = True + configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: - model = model_class(config=config) + model = model_class(config=configs_no_init) for name, param in model.named_parameters(): - if "D" in name: + if "dt_proj.bias" in name: + dt = torch.exp( + torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min)) + + math.log(config.time_step_min) + ).clamp(min=config.time_step_floor) + inv_dt = dt + torch.log(-torch.expm1(-dt)) + if param.requires_grad: + self.assertTrue(param.data.max().item() <= inv_dt[1]) + self.assertTrue(param.data.min().item() >= inv_dt[0]) + elif "A_log" in name: + A = torch.arange(1, config.num_heads + 1) + torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5) + elif "D" in name: if param.requires_grad: # check if it's a ones like torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) + else: + if param.requires_grad: + if "mixer.conv1d.weight" in name or "mixer.dt_bias" in name or "mixer.out_proj.weight" in name: + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) @unittest.skip(reason="Mamba 2 weights are not tied") def test_tied_weights_keys(self): diff --git a/tests/models/omdet_turbo/test_modeling_omdet_turbo.py b/tests/models/omdet_turbo/test_modeling_omdet_turbo.py index 11568f66f4..9d76ad392c 100644 --- a/tests/models/omdet_turbo/test_modeling_omdet_turbo.py +++ b/tests/models/omdet_turbo/test_modeling_omdet_turbo.py @@ -629,6 +629,7 @@ class OmDetTurboModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa or "decoder.channel_projection_layers" in name or "query_position_head" in name or "decoder.encoder_vision_features" in name + or "language_backbone.text_projection" in name ): continue self.assertIn( diff --git a/tests/models/timm_wrapper/test_modeling_timm_wrapper.py b/tests/models/timm_wrapper/test_modeling_timm_wrapper.py index f7f374ed57..3f103309a0 100644 --- a/tests/models/timm_wrapper/test_modeling_timm_wrapper.py +++ b/tests/models/timm_wrapper/test_modeling_timm_wrapper.py @@ -153,10 +153,18 @@ class TimmWrapperModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC def test_retain_grad_hidden_states_attentions(self): pass + @unittest.skip(reason="TimmWrapper initialization is managed on the timm side") + def test_can_init_all_missing_weights(self): + pass + @unittest.skip(reason="TimmWrapper initialization is managed on the timm side") def test_initialization(self): pass + @unittest.skip(reason="TimmWrapper initialization is managed on the timm side") + def test_mismatched_shapes_have_properly_initialized_weights(self): + pass + @unittest.skip(reason="Need to use a timm model and there is no tiny model available.") def test_model_is_small(self): pass diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 0587c73bd9..da48081d6b 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -855,7 +855,7 @@ class ModelTesterMixin: # For now, skip everything older than 2025 and "important models" (too much models to patch otherwise) # Use `supports_cache_class` as a proxy to judge "important" models in order to prioritize them # TODO: relax this as we patch more and more models - if addition_year < 2025 and not model_class._supports_cache_class: + if addition_year < 2024 and not model_class._supports_cache_class: self.skipTest(reason=f"{model_class} is not a priorited model for now.") # Monkey patch the method to add a seed (we do it on PreTrainedModel._initialize_weights, which wraps @@ -895,6 +895,11 @@ class ModelTesterMixin: model_from_config.state_dict().items(), model_from_pretrained.state_dict().items() ): self.assertEqual(k1, k2, "The keys from each model should be the same") + + # In case using torch.nn.utils.parametrizations on a module, we should skip the resulting keys + if re.search(r"\.parametrizations\..*?\.original[01]", k1): + continue + # Since we added the seed, they should be exactly the same (i.e. using allclose maybe be wrong due # to very low std in init function) if not (v1 == v2).all():