Fix missing initializations for models created in 2024 (#38987)

* fix GroundingDino

* fix SuperGlue

* fix GroundingDino

* fix MambaModel

* fix OmDetTurbo

* fix SegGpt

* fix Qwen2Audio

* fix Mamba2

* fix DabDetr

* fix Dac

* fix FalconMamba

* skip timm initialization

* fix Encodec and MusicgenMelody

* fix Musicgen

* skip timm initialization test

* fix OmDetTurbo

* clean the code

Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>

* add reviewed changes

* add back timm

* style

* better check for parametrizations

---------

Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
This commit is contained in:
BUI Van Tuan
2025-07-02 15:03:57 +02:00
committed by GitHub
parent 1125513a8d
commit e355c0a11c
21 changed files with 229 additions and 98 deletions

View File

@@ -829,6 +829,9 @@ class DabDetrPreTrainedModel(PreTrainedModel):
module.weight.data.normal_(mean=0.0, std=std) module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None: if module.bias is not None:
module.bias.data.zero_() module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.weight.data.fill_(1.0)
module.bias.data.zero_()
elif isinstance(module, nn.Embedding): elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std) module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None: if module.padding_idx is not None:
@@ -841,6 +844,8 @@ class DabDetrPreTrainedModel(PreTrainedModel):
prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1) prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
bias_value = -math.log((1 - prior_prob) / prior_prob) bias_value = -math.log((1 - prior_prob) / prior_prob)
module.class_embed.bias.data.fill_(bias_value) module.class_embed.bias.data.fill_(bias_value)
elif isinstance(module, nn.PReLU):
module.reset_parameters()
# Modified from transformers.models.detr.modeling_detr.DetrEncoder with Detr->DabDetr,DETR->ConditionalDETR # Modified from transformers.models.detr.modeling_detr.DetrEncoder with Detr->DabDetr,DETR->ConditionalDETR

View File

@@ -480,6 +480,12 @@ class DacPreTrainedModel(PreTrainedAudioTokenizerBase):
if isinstance(module, nn.Conv1d): if isinstance(module, nn.Conv1d):
nn.init.trunc_normal_(module.weight, std=0.02) nn.init.trunc_normal_(module.weight, std=0.02)
nn.init.constant_(module.bias, 0) nn.init.constant_(module.bias, 0)
elif isinstance(module, Snake1d):
module.alpha.data.fill_(1.0)
elif isinstance(module, nn.ConvTranspose1d):
module.reset_parameters()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=0.02)
def apply_weight_norm(self): def apply_weight_norm(self):
weight_norm = nn.utils.weight_norm weight_norm = nn.utils.weight_norm

View File

@@ -235,7 +235,7 @@ class EncodecLSTM(nn.Module):
LSTM without worrying about the hidden state, nor the layout of the data. Expects input as convolutional layout. LSTM without worrying about the hidden state, nor the layout of the data. Expects input as convolutional layout.
""" """
def __init__(self, config, dimension): def __init__(self, config: EncodecConfig, dimension: int):
super().__init__() super().__init__()
self.lstm = nn.LSTM(dimension, dimension, config.num_lstm_layers) self.lstm = nn.LSTM(dimension, dimension, config.num_lstm_layers)
@@ -452,11 +452,7 @@ class EncodecPreTrainedModel(PreTrainedModel):
def _init_weights(self, module): def _init_weights(self, module):
"""Initialize the weights""" """Initialize the weights"""
if isinstance(module, nn.Linear): if isinstance(module, nn.GroupNorm):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
module.bias.data.zero_() module.bias.data.zero_()
module.weight.data.fill_(1.0) module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d): elif isinstance(module, nn.Conv1d):
@@ -464,10 +460,8 @@ class EncodecPreTrainedModel(PreTrainedModel):
if module.bias is not None: if module.bias is not None:
k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-k, b=k) nn.init.uniform_(module.bias, a=-k, b=k)
elif isinstance(module, nn.Embedding): elif isinstance(module, nn.ConvTranspose1d):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) module.reset_parameters()
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LSTM): elif isinstance(module, nn.LSTM):
for name, param in module.named_parameters(): for name, param in module.named_parameters():
if "weight" in name: if "weight" in name:
@@ -659,7 +653,7 @@ class EncodecModel(EncodecPreTrainedModel):
def decode( def decode(
self, self,
audio_codes: torch.Tensor, audio_codes: torch.LongTensor,
audio_scales: torch.Tensor, audio_scales: torch.Tensor,
padding_mask: Optional[torch.Tensor] = None, padding_mask: Optional[torch.Tensor] = None,
return_dict: Optional[bool] = None, return_dict: Optional[bool] = None,
@@ -708,10 +702,10 @@ class EncodecModel(EncodecPreTrainedModel):
@auto_docstring @auto_docstring
def forward( def forward(
self, self,
input_values: torch.Tensor, input_values: torch.FloatTensor,
padding_mask: Optional[torch.Tensor] = None, padding_mask: Optional[torch.BoolTensor] = None,
bandwidth: Optional[float] = None, bandwidth: Optional[float] = None,
audio_codes: Optional[torch.Tensor] = None, audio_codes: Optional[torch.LongTensor] = None,
audio_scales: Optional[torch.Tensor] = None, audio_scales: Optional[torch.Tensor] = None,
return_dict: Optional[bool] = None, return_dict: Optional[bool] = None,
) -> Union[tuple[torch.Tensor, torch.Tensor], EncodecOutput]: ) -> Union[tuple[torch.Tensor, torch.Tensor], EncodecOutput]:

View File

@@ -445,9 +445,16 @@ class FalconMambaPreTrainedModel(PreTrainedModel):
def _init_weights(self, module): def _init_weights(self, module):
"""Initialize the weights.""" """Initialize the weights."""
std = self.config.initializer_range
if isinstance(module, FalconMambaMixer): if isinstance(module, FalconMambaMixer):
# S4D real initialization. These are not discretized!
# The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :]
A = A.expand(module.intermediate_size, -1).contiguous()
module.A_log.copy_(torch.log(A))
module.A_log._no_weight_decay = True module.A_log._no_weight_decay = True
module.D._no_weight_decay = True module.D._no_weight_decay = True
module.D.data.fill_(1.0)
dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
if self.config.time_step_init_scheme == "constant": if self.config.time_step_init_scheme == "constant":
@@ -462,16 +469,14 @@ class FalconMambaPreTrainedModel(PreTrainedModel):
).clamp(min=self.config.time_step_floor) ).clamp(min=self.config.time_step_floor)
# # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
inv_dt = dt + torch.log(-torch.expm1(-dt)) inv_dt = dt + torch.log(-torch.expm1(-dt))
with torch.no_grad():
module.dt_proj.bias.copy_(inv_dt) module.dt_proj.bias.copy_(inv_dt)
module.dt_proj.bias._no_reinit = True module.dt_proj.bias._no_reinit = True
if isinstance(module, nn.Linear): nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5))
if module.bias is not None: if module.conv1d.bias is not None:
if not getattr(module.bias, "_no_reinit", False): if not getattr(module.conv1d.bias, "_no_reinit", False):
nn.init.zeros_(module.bias) nn.init.zeros_(module.conv1d.bias)
elif isinstance(module, nn.Embedding): nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5))
nn.init.normal_(module.weight, std=self.config.initializer_range)
if self.config.rescale_prenorm_residual: if self.config.rescale_prenorm_residual:
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
@@ -480,16 +485,24 @@ class FalconMambaPreTrainedModel(PreTrainedModel):
# > -- GPT-2 :: https://openai.com/blog/better-language-models/ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
# #
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
for name, p in module.named_parameters():
if name in ["out_proj.weight"]:
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
# Following Pytorch init, except scale by 1/sqrt(2 * n_layer) # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
# We need to reinit p since this code could be called multiple times # We need to reinit p since this code could be called multiple times
# Having just p *= scale would repeatedly scale it down # Having just p *= scale would repeatedly scale it down
nn.init.kaiming_uniform_(p, a=math.sqrt(5)) p = module.out_proj.weight
with torch.no_grad():
p /= math.sqrt(self.config.num_hidden_layers) p /= math.sqrt(self.config.num_hidden_layers)
if isinstance(module, nn.Linear):
if not getattr(module.weight, "_no_reinit", False):
nn.init.normal_(module.weight, std=std)
if module.bias is not None:
if not getattr(module.bias, "_no_reinit", False):
nn.init.zeros_(module.bias)
elif isinstance(module, FalconMambaRMSNorm):
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Embedding):
nn.init.normal_(module.weight, std=std)
@dataclass @dataclass
@auto_docstring( @auto_docstring(

View File

@@ -1414,16 +1414,18 @@ class GroundingDinoPreTrainedModel(PreTrainedModel):
module.out_vision_proj.bias.data.fill_(0) module.out_vision_proj.bias.data.fill_(0)
nn.init.xavier_uniform_(module.out_text_proj.weight) nn.init.xavier_uniform_(module.out_text_proj.weight)
module.out_text_proj.bias.data.fill_(0) module.out_text_proj.bias.data.fill_(0)
elif isinstance(module, (GroundingDinoEncoderLayer, GroundingDinoDecoderLayer)): elif isinstance(module, GroundingDinoFusionLayer):
for p in module.parameters(): module.vision_param.data.fill_(1e-4)
if p.dim() > 1: module.text_param.data.fill_(1e-4)
nn.init.normal_(p, mean=0.0, std=std)
elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
# Slightly different from the TF version which uses truncated_normal for initialization # Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617 # cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=std) module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None: if module.bias is not None:
module.bias.data.zero_() module.bias.data.zero_()
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
module.weight.data.fill_(1.0)
module.bias.data.zero_()
elif isinstance(module, nn.Embedding): elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std) module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None: if module.padding_idx is not None:

View File

@@ -382,9 +382,16 @@ class MambaPreTrainedModel(PreTrainedModel):
def _init_weights(self, module): def _init_weights(self, module):
"""Initialize the weights.""" """Initialize the weights."""
std = self.config.initializer_range
if isinstance(module, MambaMixer): if isinstance(module, MambaMixer):
# S4D real initialization. These are not discretized!
# The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :]
A = A.expand(module.intermediate_size, -1).contiguous()
module.A_log.copy_(torch.log(A))
module.A_log._no_weight_decay = True module.A_log._no_weight_decay = True
module.D._no_weight_decay = True module.D._no_weight_decay = True
module.D.data.fill_(1.0)
dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
if self.config.time_step_init_scheme == "constant": if self.config.time_step_init_scheme == "constant":
@@ -399,16 +406,14 @@ class MambaPreTrainedModel(PreTrainedModel):
).clamp(min=self.config.time_step_floor) ).clamp(min=self.config.time_step_floor)
# # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
inv_dt = dt + torch.log(-torch.expm1(-dt)) inv_dt = dt + torch.log(-torch.expm1(-dt))
with torch.no_grad():
module.dt_proj.bias.copy_(inv_dt) module.dt_proj.bias.copy_(inv_dt)
module.dt_proj.bias._no_reinit = True module.dt_proj.bias._no_reinit = True
if isinstance(module, nn.Linear): nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5))
if module.bias is not None: if module.conv1d.bias is not None:
if not getattr(module.bias, "_no_reinit", False): if not getattr(module.conv1d.bias, "_no_reinit", False):
nn.init.zeros_(module.bias) nn.init.zeros_(module.conv1d.bias)
elif isinstance(module, nn.Embedding): nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5))
nn.init.normal_(module.weight, std=self.config.initializer_range)
if self.config.rescale_prenorm_residual: if self.config.rescale_prenorm_residual:
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
@@ -417,16 +422,24 @@ class MambaPreTrainedModel(PreTrainedModel):
# > -- GPT-2 :: https://openai.com/blog/better-language-models/ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
# #
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
for name, p in module.named_parameters():
if name in ["out_proj.weight"]:
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
# Following Pytorch init, except scale by 1/sqrt(2 * n_layer) # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
# We need to reinit p since this code could be called multiple times # We need to reinit p since this code could be called multiple times
# Having just p *= scale would repeatedly scale it down # Having just p *= scale would repeatedly scale it down
nn.init.kaiming_uniform_(p, a=math.sqrt(5)) p = module.out_proj.weight
with torch.no_grad():
p /= math.sqrt(self.config.num_hidden_layers) p /= math.sqrt(self.config.num_hidden_layers)
if isinstance(module, nn.Linear):
if not getattr(module.weight, "_no_reinit", False):
nn.init.normal_(module.weight, std=std)
if module.bias is not None:
if not getattr(module.bias, "_no_reinit", False):
nn.init.zeros_(module.bias)
elif isinstance(module, MambaRMSNorm):
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Embedding):
nn.init.normal_(module.weight, std=std)
@dataclass @dataclass
@auto_docstring( @auto_docstring(

View File

@@ -721,9 +721,15 @@ class Mamba2PreTrainedModel(PreTrainedModel):
def _init_weights(self, module): def _init_weights(self, module):
"""Initialize the weights.""" """Initialize the weights."""
std = self.config.initializer_range
if isinstance(module, Mamba2Mixer): if isinstance(module, Mamba2Mixer):
# S4D real initialization. These are not discretized!
# The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
A = torch.arange(1, self.config.num_heads + 1)
module.A_log.copy_(torch.log(A))
module.A_log._no_weight_decay = True module.A_log._no_weight_decay = True
module.D._no_weight_decay = True module.D._no_weight_decay = True
module.D.data.fill_(1.0)
dt = torch.exp( dt = torch.exp(
torch.rand(self.config.num_heads) torch.rand(self.config.num_heads)
@@ -733,16 +739,14 @@ class Mamba2PreTrainedModel(PreTrainedModel):
# # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
inv_dt = dt + torch.log(-torch.expm1(-dt)) inv_dt = dt + torch.log(-torch.expm1(-dt))
with torch.no_grad():
module.dt_bias.copy_(inv_dt) module.dt_bias.copy_(inv_dt)
module.dt_bias._no_reinit = True module.dt_bias._no_reinit = True
if isinstance(module, nn.Linear): nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5))
if module.bias is not None: if module.conv1d.bias is not None:
if not getattr(module.bias, "_no_reinit", False): if not getattr(module.conv1d.bias, "_no_reinit", False):
nn.init.zeros_(module.bias) nn.init.zeros_(module.conv1d.bias)
elif isinstance(module, nn.Embedding): nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5))
nn.init.normal_(module.weight, std=self.config.initializer_range)
if self.config.rescale_prenorm_residual: if self.config.rescale_prenorm_residual:
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
@@ -751,16 +755,24 @@ class Mamba2PreTrainedModel(PreTrainedModel):
# > -- GPT-2 :: https://openai.com/blog/better-language-models/ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
# #
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
for name, p in module.named_parameters():
if name in ["out_proj.weight"]:
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
# Following Pytorch init, except scale by 1/sqrt(2 * n_layer) # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
# We need to reinit p since this code could be called multiple times # We need to reinit p since this code could be called multiple times
# Having just p *= scale would repeatedly scale it down # Having just p *= scale would repeatedly scale it down
nn.init.kaiming_uniform_(p, a=math.sqrt(5)) p = module.out_proj.weight
with torch.no_grad():
p /= math.sqrt(self.config.num_hidden_layers) p /= math.sqrt(self.config.num_hidden_layers)
if isinstance(module, nn.Linear):
if not getattr(module.weight, "_no_reinit", False):
nn.init.normal_(module.weight, std=std)
if module.bias is not None:
if not getattr(module.bias, "_no_reinit", False):
nn.init.zeros_(module.bias)
elif isinstance(module, (Mamba2RMSNorm, MambaRMSNormGated)):
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Embedding):
nn.init.normal_(module.weight, std=std)
@dataclass @dataclass
@auto_docstring( @auto_docstring(

View File

@@ -440,10 +440,13 @@ class MusicgenPreTrainedModel(PreTrainedModel):
def _init_weights(self, module): def _init_weights(self, module):
std = self.config.initializer_factor std = self.config.initializer_factor
if isinstance(module, (nn.Linear, nn.Conv1d)): if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std) module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None: if module.bias is not None:
module.bias.data.zero_() module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.weight.data.fill_(1.0)
module.bias.data.zero_()
elif isinstance(module, nn.Embedding): elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std) module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None: if module.padding_idx is not None:

View File

@@ -406,10 +406,13 @@ class MusicgenMelodyPreTrainedModel(PreTrainedModel):
def _init_weights(self, module): def _init_weights(self, module):
std = self.config.initializer_factor std = self.config.initializer_factor
if isinstance(module, (nn.Linear, nn.Conv1d)): if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std) module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None: if module.bias is not None:
module.bias.data.zero_() module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.weight.data.fill_(1.0)
module.bias.data.zero_()
elif isinstance(module, nn.Embedding): elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std) module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None: if module.padding_idx is not None:
@@ -1286,7 +1289,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin):
The text encoder model that encodes text into hidden states for conditioning. The text encoder model that encodes text into hidden states for conditioning.
audio_encoder (`PreTrainedModel`, *optional*): audio_encoder (`PreTrainedModel`, *optional*):
The audio encoder model that encodes audio into hidden states for conditioning. The audio encoder model that encodes audio into hidden states for conditioning.
decoder (`MusicgenForCausalLM`, *optional*): decoder (`MusicgenMelodyForCausalLM`, *optional*):
The decoder model that generates audio tokens based on conditioning signals. The decoder model that generates audio tokens based on conditioning signals.
""" """
if config is None and None in (text_encoder, audio_encoder, decoder): if config is None and None in (text_encoder, audio_encoder, decoder):

View File

@@ -1006,10 +1006,15 @@ class OmDetTurboPreTrainedModel(PreTrainedModel):
nn.init.xavier_uniform_(module.query_position_head.layers[1].weight) nn.init.xavier_uniform_(module.query_position_head.layers[1].weight)
for layer in module.channel_projection_layers: for layer in module.channel_projection_layers:
nn.init.xavier_uniform_(layer[0].weight) nn.init.xavier_uniform_(layer[0].weight)
elif isinstance(module, OmDetTurboLanguageBackbone):
nn.init.normal_(module.text_projection, std=self.config.text_projection_in_dim**-0.5)
elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
module.weight.data.normal_(mean=0.0, std=self.config.init_std) module.weight.data.normal_(mean=0.0, std=self.config.init_std)
if module.bias is not None: if module.bias is not None:
module.bias.data.zero_() module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.weight.data.fill_(1.0)
module.bias.data.zero_()
def _set_gradient_checkpointing(self, module, value=False): def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, OmDetTurboDecoder): if isinstance(module, OmDetTurboDecoder):

View File

@@ -283,6 +283,9 @@ class Qwen2AudioPreTrainedModel(PreTrainedModel):
module.weight.data.normal_(mean=0.0, std=std) module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None: if module.bias is not None:
module.bias.data.zero_() module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.weight.data.fill_(1.0)
module.bias.data.zero_()
elif isinstance(module, nn.Embedding): elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std) module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None: if module.padding_idx is not None:

View File

@@ -604,7 +604,7 @@ class SegGptPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True supports_gradient_checkpointing = True
_no_split_modules = ["SegGptEmbeddings", "SegGptLayer"] _no_split_modules = ["SegGptEmbeddings", "SegGptLayer"]
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: def _init_weights(self, module: nn.Module) -> None:
"""Initialize the weights""" """Initialize the weights"""
std = self.config.initializer_range std = self.config.initializer_range
if isinstance(module, (nn.Linear, nn.Conv2d)): if isinstance(module, (nn.Linear, nn.Conv2d)):
@@ -615,7 +615,7 @@ class SegGptPreTrainedModel(PreTrainedModel):
) )
if module.bias is not None: if module.bias is not None:
module.bias.data.zero_() module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm): elif isinstance(module, (nn.LayerNorm, SegGptLayerNorm)):
module.bias.data.zero_() module.bias.data.zero_()
module.weight.data.fill_(1.0) module.weight.data.fill_(1.0)
elif isinstance(module, SegGptAttention): elif isinstance(module, SegGptAttention):

View File

@@ -551,17 +551,18 @@ class SuperGluePreTrainedModel(PreTrainedModel):
def _init_weights(self, module: nn.Module) -> None: def _init_weights(self, module: nn.Module) -> None:
"""Initialize the weights""" """Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d, nn.Conv1d)): if isinstance(module, (nn.Linear, nn.Conv2d)):
# Slightly different from the TF version which uses truncated_normal for initialization # Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617 # cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None: if module.bias is not None:
module.bias.data.zero_() module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm): elif isinstance(module, nn.BatchNorm1d):
module.bias.data.zero_() module.bias.data.zero_()
module.weight.data.fill_(1.0) module.weight.data.fill_(1.0)
elif isinstance(module, SuperGlueMultiLayerPerceptron):
nn.init.constant_(module.linear.bias, 0.0) if hasattr(module, "bin_score"):
module.bin_score.data.fill_(1.0)
@auto_docstring( @auto_docstring(

View File

@@ -310,12 +310,13 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
def test_feed_forward_chunking(self): def test_feed_forward_chunking(self):
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
# original_config.norm_type = "time_group_norm"
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
torch.manual_seed(0) torch.manual_seed(0)
config = copy.deepcopy(original_config) config = copy.deepcopy(original_config)
config.chunk_length_s = None config.chunk_length_s = None
config.overlap = None config.overlap = None
config.sampling_rate = 10 config.sampling_rate = 20
model = model_class(config) model = model_class(config)
model.to(torch_device) model.to(torch_device)
@@ -326,9 +327,9 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
hidden_states_no_chunk = model(**inputs)[1] hidden_states_no_chunk = model(**inputs)[1]
torch.manual_seed(0) torch.manual_seed(0)
config.chunk_length_s = 1 config.chunk_length_s = 2
config.overlap = 0 config.overlap = 0
config.sampling_rate = 10 config.sampling_rate = 20
model = model_class(config) model = model_class(config)
model.to(torch_device) model.to(torch_device)

View File

@@ -33,7 +33,7 @@ from transformers.testing_utils import (
from ...generation.test_utils import GenerationTesterMixin from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin from ...test_pipeline_mixin import PipelineTesterMixin
@@ -359,9 +359,11 @@ class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest
def test_initialization(self): def test_initialization(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()
config.rescale_prenorm_residual = True
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config=config) model = model_class(config=configs_no_init)
for name, param in model.named_parameters(): for name, param in model.named_parameters():
if "dt_proj.bias" in name: if "dt_proj.bias" in name:
dt = torch.exp( dt = torch.exp(
@@ -380,6 +382,19 @@ class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest
if param.requires_grad: if param.requires_grad:
# check if it's a ones like # check if it's a ones like
torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
else:
if param.requires_grad:
if (
"mixer.conv1d.weight" in name
or "mixer.dt_proj.weight" in name
or "mixer.out_proj.weight" in name
):
continue
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
@slow @slow
# Ignore copy # Ignore copy

View File

@@ -586,6 +586,8 @@ class GroundingDinoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes
or "value_proj" in name or "value_proj" in name
or "output_proj" in name or "output_proj" in name
or "reference_points" in name or "reference_points" in name
or "vision_proj" in name
or "text_proj" in name
): ):
continue continue
self.assertIn( self.assertIn(

View File

@@ -24,7 +24,7 @@ from transformers.testing_utils import require_torch, require_torch_multi_gpu, s
from ...generation.test_utils import GenerationTesterMixin from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin from ...test_pipeline_mixin import PipelineTesterMixin
@@ -326,9 +326,11 @@ class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
def test_initialization(self): def test_initialization(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()
config.rescale_prenorm_residual = True
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config=config) model = model_class(config=configs_no_init)
for name, param in model.named_parameters(): for name, param in model.named_parameters():
if "dt_proj.bias" in name: if "dt_proj.bias" in name:
dt = torch.exp( dt = torch.exp(
@@ -347,6 +349,19 @@ class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
if param.requires_grad: if param.requires_grad:
# check if it's a ones like # check if it's a ones like
torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
else:
if param.requires_grad:
if (
"mixer.conv1d.weight" in name
or "mixer.dt_proj.weight" in name
or "mixer.out_proj.weight" in name
):
continue
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
@slow @slow
def test_model_from_pretrained(self): def test_model_from_pretrained(self):

View File

@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import math
import unittest import unittest
from transformers import AutoTokenizer, Mamba2Config, is_torch_available from transformers import AutoTokenizer, Mamba2Config, is_torch_available
@@ -28,7 +29,7 @@ from transformers.utils.import_utils import is_causal_conv1d_available, is_mamba
from ...generation.test_utils import GenerationTesterMixin from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin from ...test_pipeline_mixin import PipelineTesterMixin
@@ -276,14 +277,37 @@ class Mamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
def test_initialization(self): def test_initialization(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()
config.rescale_prenorm_residual = True
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config=config) model = model_class(config=configs_no_init)
for name, param in model.named_parameters(): for name, param in model.named_parameters():
if "D" in name: if "dt_proj.bias" in name:
dt = torch.exp(
torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min))
+ math.log(config.time_step_min)
).clamp(min=config.time_step_floor)
inv_dt = dt + torch.log(-torch.expm1(-dt))
if param.requires_grad:
self.assertTrue(param.data.max().item() <= inv_dt[1])
self.assertTrue(param.data.min().item() >= inv_dt[0])
elif "A_log" in name:
A = torch.arange(1, config.num_heads + 1)
torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5)
elif "D" in name:
if param.requires_grad: if param.requires_grad:
# check if it's a ones like # check if it's a ones like
torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
else:
if param.requires_grad:
if "mixer.conv1d.weight" in name or "mixer.dt_bias" in name or "mixer.out_proj.weight" in name:
continue
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
@unittest.skip(reason="Mamba 2 weights are not tied") @unittest.skip(reason="Mamba 2 weights are not tied")
def test_tied_weights_keys(self): def test_tied_weights_keys(self):

View File

@@ -629,6 +629,7 @@ class OmDetTurboModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
or "decoder.channel_projection_layers" in name or "decoder.channel_projection_layers" in name
or "query_position_head" in name or "query_position_head" in name
or "decoder.encoder_vision_features" in name or "decoder.encoder_vision_features" in name
or "language_backbone.text_projection" in name
): ):
continue continue
self.assertIn( self.assertIn(

View File

@@ -153,10 +153,18 @@ class TimmWrapperModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
def test_retain_grad_hidden_states_attentions(self): def test_retain_grad_hidden_states_attentions(self):
pass pass
@unittest.skip(reason="TimmWrapper initialization is managed on the timm side")
def test_can_init_all_missing_weights(self):
pass
@unittest.skip(reason="TimmWrapper initialization is managed on the timm side") @unittest.skip(reason="TimmWrapper initialization is managed on the timm side")
def test_initialization(self): def test_initialization(self):
pass pass
@unittest.skip(reason="TimmWrapper initialization is managed on the timm side")
def test_mismatched_shapes_have_properly_initialized_weights(self):
pass
@unittest.skip(reason="Need to use a timm model and there is no tiny model available.") @unittest.skip(reason="Need to use a timm model and there is no tiny model available.")
def test_model_is_small(self): def test_model_is_small(self):
pass pass

View File

@@ -855,7 +855,7 @@ class ModelTesterMixin:
# For now, skip everything older than 2025 and "important models" (too many models to patch otherwise) # For now, skip everything older than 2025 and "important models" (too many models to patch otherwise)
# Use `supports_cache_class` as a proxy to judge "important" models in order to prioritize them # Use `supports_cache_class` as a proxy to judge "important" models in order to prioritize them
# TODO: relax this as we patch more and more models # TODO: relax this as we patch more and more models
if addition_year < 2025 and not model_class._supports_cache_class: if addition_year < 2024 and not model_class._supports_cache_class:
self.skipTest(reason=f"{model_class} is not a priorited model for now.") self.skipTest(reason=f"{model_class} is not a priorited model for now.")
# Monkey patch the method to add a seed (we do it on PreTrainedModel._initialize_weights, which wraps # Monkey patch the method to add a seed (we do it on PreTrainedModel._initialize_weights, which wraps
@@ -895,6 +895,11 @@ class ModelTesterMixin:
model_from_config.state_dict().items(), model_from_pretrained.state_dict().items() model_from_config.state_dict().items(), model_from_pretrained.state_dict().items()
): ):
self.assertEqual(k1, k2, "The keys from each model should be the same") self.assertEqual(k1, k2, "The keys from each model should be the same")
# When torch.nn.utils.parametrizations is applied to a module, we should skip the resulting keys
if re.search(r"\.parametrizations\..*?\.original[01]", k1):
continue
# Since we added the seed, they should be exactly the same (i.e. using allclose may be wrong due # Since we added the seed, they should be exactly the same (i.e. using allclose may be wrong due
# to very low std in init function) # to very low std in init function)
if not (v1 == v2).all(): if not (v1 == v2).all():