Fix missing initializations for models created in 2024 (#38987)
* fix GroundingDino
* fix SuperGlue
* fix GroundingDino
* fix MambaModel
* fix OmDetTurbo
* fix SegGpt
* fix Qwen2Audio
* fix Mamba2
* fix DabDetr
* fix Dac
* fix FalconMamba
* skip timm initialization
* fix Encodec and MusicgenMelody
* fix Musicgen
* skip timm initialization test
* fix OmDetTurbo
* clean the code

Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>

* add reviewed changes
* add back timm
* style
* better check for parametrizations

---------

Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
This commit is contained in:
@@ -829,6 +829,9 @@ class DabDetrPreTrainedModel(PreTrainedModel):
|
|||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.bias is not None:
|
if module.bias is not None:
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
|
elif isinstance(module, nn.LayerNorm):
|
||||||
|
module.weight.data.fill_(1.0)
|
||||||
|
module.bias.data.zero_()
|
||||||
elif isinstance(module, nn.Embedding):
|
elif isinstance(module, nn.Embedding):
|
||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.padding_idx is not None:
|
if module.padding_idx is not None:
|
||||||
@@ -841,6 +844,8 @@ class DabDetrPreTrainedModel(PreTrainedModel):
|
|||||||
prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
|
prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
|
||||||
bias_value = -math.log((1 - prior_prob) / prior_prob)
|
bias_value = -math.log((1 - prior_prob) / prior_prob)
|
||||||
module.class_embed.bias.data.fill_(bias_value)
|
module.class_embed.bias.data.fill_(bias_value)
|
||||||
|
elif isinstance(module, nn.PReLU):
|
||||||
|
module.reset_parameters()
|
||||||
|
|
||||||
|
|
||||||
# Modified from transformers.models.detr.modeling_detr.DetrEncoder with Detr->DabDetr,DETR->ConditionalDETR
|
# Modified from transformers.models.detr.modeling_detr.DetrEncoder with Detr->DabDetr,DETR->ConditionalDETR
|
||||||
|
|||||||
@@ -480,6 +480,12 @@ class DacPreTrainedModel(PreTrainedAudioTokenizerBase):
|
|||||||
if isinstance(module, nn.Conv1d):
|
if isinstance(module, nn.Conv1d):
|
||||||
nn.init.trunc_normal_(module.weight, std=0.02)
|
nn.init.trunc_normal_(module.weight, std=0.02)
|
||||||
nn.init.constant_(module.bias, 0)
|
nn.init.constant_(module.bias, 0)
|
||||||
|
elif isinstance(module, Snake1d):
|
||||||
|
module.alpha.data.fill_(1.0)
|
||||||
|
elif isinstance(module, nn.ConvTranspose1d):
|
||||||
|
module.reset_parameters()
|
||||||
|
elif isinstance(module, nn.Embedding):
|
||||||
|
module.weight.data.normal_(mean=0.0, std=0.02)
|
||||||
|
|
||||||
def apply_weight_norm(self):
|
def apply_weight_norm(self):
|
||||||
weight_norm = nn.utils.weight_norm
|
weight_norm = nn.utils.weight_norm
|
||||||
|
|||||||
@@ -235,7 +235,7 @@ class EncodecLSTM(nn.Module):
|
|||||||
LSTM without worrying about the hidden state, nor the layout of the data. Expects input as convolutional layout.
|
LSTM without worrying about the hidden state, nor the layout of the data. Expects input as convolutional layout.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, dimension):
|
def __init__(self, config: EncodecConfig, dimension: int):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.lstm = nn.LSTM(dimension, dimension, config.num_lstm_layers)
|
self.lstm = nn.LSTM(dimension, dimension, config.num_lstm_layers)
|
||||||
|
|
||||||
@@ -452,11 +452,7 @@ class EncodecPreTrainedModel(PreTrainedModel):
|
|||||||
|
|
||||||
def _init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
"""Initialize the weights"""
|
"""Initialize the weights"""
|
||||||
if isinstance(module, nn.Linear):
|
if isinstance(module, nn.GroupNorm):
|
||||||
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
|
||||||
if module.bias is not None:
|
|
||||||
module.bias.data.zero_()
|
|
||||||
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
|
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
module.weight.data.fill_(1.0)
|
module.weight.data.fill_(1.0)
|
||||||
elif isinstance(module, nn.Conv1d):
|
elif isinstance(module, nn.Conv1d):
|
||||||
@@ -464,10 +460,8 @@ class EncodecPreTrainedModel(PreTrainedModel):
|
|||||||
if module.bias is not None:
|
if module.bias is not None:
|
||||||
k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
|
k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
|
||||||
nn.init.uniform_(module.bias, a=-k, b=k)
|
nn.init.uniform_(module.bias, a=-k, b=k)
|
||||||
elif isinstance(module, nn.Embedding):
|
elif isinstance(module, nn.ConvTranspose1d):
|
||||||
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
module.reset_parameters()
|
||||||
if module.padding_idx is not None:
|
|
||||||
module.weight.data[module.padding_idx].zero_()
|
|
||||||
elif isinstance(module, nn.LSTM):
|
elif isinstance(module, nn.LSTM):
|
||||||
for name, param in module.named_parameters():
|
for name, param in module.named_parameters():
|
||||||
if "weight" in name:
|
if "weight" in name:
|
||||||
@@ -659,7 +653,7 @@ class EncodecModel(EncodecPreTrainedModel):
|
|||||||
|
|
||||||
def decode(
|
def decode(
|
||||||
self,
|
self,
|
||||||
audio_codes: torch.Tensor,
|
audio_codes: torch.LongTensor,
|
||||||
audio_scales: torch.Tensor,
|
audio_scales: torch.Tensor,
|
||||||
padding_mask: Optional[torch.Tensor] = None,
|
padding_mask: Optional[torch.Tensor] = None,
|
||||||
return_dict: Optional[bool] = None,
|
return_dict: Optional[bool] = None,
|
||||||
@@ -708,10 +702,10 @@ class EncodecModel(EncodecPreTrainedModel):
|
|||||||
@auto_docstring
|
@auto_docstring
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
input_values: torch.Tensor,
|
input_values: torch.FloatTensor,
|
||||||
padding_mask: Optional[torch.Tensor] = None,
|
padding_mask: Optional[torch.BoolTensor] = None,
|
||||||
bandwidth: Optional[float] = None,
|
bandwidth: Optional[float] = None,
|
||||||
audio_codes: Optional[torch.Tensor] = None,
|
audio_codes: Optional[torch.LongTensor] = None,
|
||||||
audio_scales: Optional[torch.Tensor] = None,
|
audio_scales: Optional[torch.Tensor] = None,
|
||||||
return_dict: Optional[bool] = None,
|
return_dict: Optional[bool] = None,
|
||||||
) -> Union[tuple[torch.Tensor, torch.Tensor], EncodecOutput]:
|
) -> Union[tuple[torch.Tensor, torch.Tensor], EncodecOutput]:
|
||||||
|
|||||||
@@ -445,9 +445,16 @@ class FalconMambaPreTrainedModel(PreTrainedModel):
|
|||||||
|
|
||||||
def _init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
"""Initialize the weights."""
|
"""Initialize the weights."""
|
||||||
|
std = self.config.initializer_range
|
||||||
if isinstance(module, FalconMambaMixer):
|
if isinstance(module, FalconMambaMixer):
|
||||||
|
# S4D real initialization. These are not discretized!
|
||||||
|
# The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
|
||||||
|
A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :]
|
||||||
|
A = A.expand(module.intermediate_size, -1).contiguous()
|
||||||
|
module.A_log.copy_(torch.log(A))
|
||||||
module.A_log._no_weight_decay = True
|
module.A_log._no_weight_decay = True
|
||||||
module.D._no_weight_decay = True
|
module.D._no_weight_decay = True
|
||||||
|
module.D.data.fill_(1.0)
|
||||||
|
|
||||||
dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
|
dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
|
||||||
if self.config.time_step_init_scheme == "constant":
|
if self.config.time_step_init_scheme == "constant":
|
||||||
@@ -462,16 +469,14 @@ class FalconMambaPreTrainedModel(PreTrainedModel):
|
|||||||
).clamp(min=self.config.time_step_floor)
|
).clamp(min=self.config.time_step_floor)
|
||||||
# # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
|
# # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
|
||||||
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
||||||
with torch.no_grad():
|
|
||||||
module.dt_proj.bias.copy_(inv_dt)
|
module.dt_proj.bias.copy_(inv_dt)
|
||||||
module.dt_proj.bias._no_reinit = True
|
module.dt_proj.bias._no_reinit = True
|
||||||
|
|
||||||
if isinstance(module, nn.Linear):
|
nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5))
|
||||||
if module.bias is not None:
|
if module.conv1d.bias is not None:
|
||||||
if not getattr(module.bias, "_no_reinit", False):
|
if not getattr(module.conv1d.bias, "_no_reinit", False):
|
||||||
nn.init.zeros_(module.bias)
|
nn.init.zeros_(module.conv1d.bias)
|
||||||
elif isinstance(module, nn.Embedding):
|
nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5))
|
||||||
nn.init.normal_(module.weight, std=self.config.initializer_range)
|
|
||||||
|
|
||||||
if self.config.rescale_prenorm_residual:
|
if self.config.rescale_prenorm_residual:
|
||||||
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
|
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
|
||||||
@@ -480,16 +485,24 @@ class FalconMambaPreTrainedModel(PreTrainedModel):
|
|||||||
# > -- GPT-2 :: https://openai.com/blog/better-language-models/
|
# > -- GPT-2 :: https://openai.com/blog/better-language-models/
|
||||||
#
|
#
|
||||||
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
|
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
|
||||||
for name, p in module.named_parameters():
|
|
||||||
if name in ["out_proj.weight"]:
|
|
||||||
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
|
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
|
||||||
# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|
# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|
||||||
# We need to reinit p since this code could be called multiple times
|
# We need to reinit p since this code could be called multiple times
|
||||||
# Having just p *= scale would repeatedly scale it down
|
# Having just p *= scale would repeatedly scale it down
|
||||||
nn.init.kaiming_uniform_(p, a=math.sqrt(5))
|
p = module.out_proj.weight
|
||||||
with torch.no_grad():
|
|
||||||
p /= math.sqrt(self.config.num_hidden_layers)
|
p /= math.sqrt(self.config.num_hidden_layers)
|
||||||
|
|
||||||
|
if isinstance(module, nn.Linear):
|
||||||
|
if not getattr(module.weight, "_no_reinit", False):
|
||||||
|
nn.init.normal_(module.weight, std=std)
|
||||||
|
if module.bias is not None:
|
||||||
|
if not getattr(module.bias, "_no_reinit", False):
|
||||||
|
nn.init.zeros_(module.bias)
|
||||||
|
elif isinstance(module, FalconMambaRMSNorm):
|
||||||
|
module.weight.data.fill_(1.0)
|
||||||
|
elif isinstance(module, nn.Embedding):
|
||||||
|
nn.init.normal_(module.weight, std=std)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@auto_docstring(
|
@auto_docstring(
|
||||||
|
|||||||
@@ -1414,16 +1414,18 @@ class GroundingDinoPreTrainedModel(PreTrainedModel):
|
|||||||
module.out_vision_proj.bias.data.fill_(0)
|
module.out_vision_proj.bias.data.fill_(0)
|
||||||
nn.init.xavier_uniform_(module.out_text_proj.weight)
|
nn.init.xavier_uniform_(module.out_text_proj.weight)
|
||||||
module.out_text_proj.bias.data.fill_(0)
|
module.out_text_proj.bias.data.fill_(0)
|
||||||
elif isinstance(module, (GroundingDinoEncoderLayer, GroundingDinoDecoderLayer)):
|
elif isinstance(module, GroundingDinoFusionLayer):
|
||||||
for p in module.parameters():
|
module.vision_param.data.fill_(1e-4)
|
||||||
if p.dim() > 1:
|
module.text_param.data.fill_(1e-4)
|
||||||
nn.init.normal_(p, mean=0.0, std=std)
|
|
||||||
elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
|
elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
|
||||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.bias is not None:
|
if module.bias is not None:
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
|
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
|
||||||
|
module.weight.data.fill_(1.0)
|
||||||
|
module.bias.data.zero_()
|
||||||
elif isinstance(module, nn.Embedding):
|
elif isinstance(module, nn.Embedding):
|
||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.padding_idx is not None:
|
if module.padding_idx is not None:
|
||||||
|
|||||||
@@ -382,9 +382,16 @@ class MambaPreTrainedModel(PreTrainedModel):
|
|||||||
|
|
||||||
def _init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
"""Initialize the weights."""
|
"""Initialize the weights."""
|
||||||
|
std = self.config.initializer_range
|
||||||
if isinstance(module, MambaMixer):
|
if isinstance(module, MambaMixer):
|
||||||
|
# S4D real initialization. These are not discretized!
|
||||||
|
# The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
|
||||||
|
A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :]
|
||||||
|
A = A.expand(module.intermediate_size, -1).contiguous()
|
||||||
|
module.A_log.copy_(torch.log(A))
|
||||||
module.A_log._no_weight_decay = True
|
module.A_log._no_weight_decay = True
|
||||||
module.D._no_weight_decay = True
|
module.D._no_weight_decay = True
|
||||||
|
module.D.data.fill_(1.0)
|
||||||
|
|
||||||
dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
|
dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
|
||||||
if self.config.time_step_init_scheme == "constant":
|
if self.config.time_step_init_scheme == "constant":
|
||||||
@@ -399,16 +406,14 @@ class MambaPreTrainedModel(PreTrainedModel):
|
|||||||
).clamp(min=self.config.time_step_floor)
|
).clamp(min=self.config.time_step_floor)
|
||||||
# # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
|
# # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
|
||||||
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
||||||
with torch.no_grad():
|
|
||||||
module.dt_proj.bias.copy_(inv_dt)
|
module.dt_proj.bias.copy_(inv_dt)
|
||||||
module.dt_proj.bias._no_reinit = True
|
module.dt_proj.bias._no_reinit = True
|
||||||
|
|
||||||
if isinstance(module, nn.Linear):
|
nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5))
|
||||||
if module.bias is not None:
|
if module.conv1d.bias is not None:
|
||||||
if not getattr(module.bias, "_no_reinit", False):
|
if not getattr(module.conv1d.bias, "_no_reinit", False):
|
||||||
nn.init.zeros_(module.bias)
|
nn.init.zeros_(module.conv1d.bias)
|
||||||
elif isinstance(module, nn.Embedding):
|
nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5))
|
||||||
nn.init.normal_(module.weight, std=self.config.initializer_range)
|
|
||||||
|
|
||||||
if self.config.rescale_prenorm_residual:
|
if self.config.rescale_prenorm_residual:
|
||||||
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
|
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
|
||||||
@@ -417,16 +422,24 @@ class MambaPreTrainedModel(PreTrainedModel):
|
|||||||
# > -- GPT-2 :: https://openai.com/blog/better-language-models/
|
# > -- GPT-2 :: https://openai.com/blog/better-language-models/
|
||||||
#
|
#
|
||||||
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
|
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
|
||||||
for name, p in module.named_parameters():
|
|
||||||
if name in ["out_proj.weight"]:
|
|
||||||
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
|
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
|
||||||
# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|
# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|
||||||
# We need to reinit p since this code could be called multiple times
|
# We need to reinit p since this code could be called multiple times
|
||||||
# Having just p *= scale would repeatedly scale it down
|
# Having just p *= scale would repeatedly scale it down
|
||||||
nn.init.kaiming_uniform_(p, a=math.sqrt(5))
|
p = module.out_proj.weight
|
||||||
with torch.no_grad():
|
|
||||||
p /= math.sqrt(self.config.num_hidden_layers)
|
p /= math.sqrt(self.config.num_hidden_layers)
|
||||||
|
|
||||||
|
if isinstance(module, nn.Linear):
|
||||||
|
if not getattr(module.weight, "_no_reinit", False):
|
||||||
|
nn.init.normal_(module.weight, std=std)
|
||||||
|
if module.bias is not None:
|
||||||
|
if not getattr(module.bias, "_no_reinit", False):
|
||||||
|
nn.init.zeros_(module.bias)
|
||||||
|
elif isinstance(module, MambaRMSNorm):
|
||||||
|
module.weight.data.fill_(1.0)
|
||||||
|
elif isinstance(module, nn.Embedding):
|
||||||
|
nn.init.normal_(module.weight, std=std)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@auto_docstring(
|
@auto_docstring(
|
||||||
|
|||||||
@@ -721,9 +721,15 @@ class Mamba2PreTrainedModel(PreTrainedModel):
|
|||||||
|
|
||||||
def _init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
"""Initialize the weights."""
|
"""Initialize the weights."""
|
||||||
|
std = self.config.initializer_range
|
||||||
if isinstance(module, Mamba2Mixer):
|
if isinstance(module, Mamba2Mixer):
|
||||||
|
# S4D real initialization. These are not discretized!
|
||||||
|
# The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
|
||||||
|
A = torch.arange(1, self.config.num_heads + 1)
|
||||||
|
module.A_log.copy_(torch.log(A))
|
||||||
module.A_log._no_weight_decay = True
|
module.A_log._no_weight_decay = True
|
||||||
module.D._no_weight_decay = True
|
module.D._no_weight_decay = True
|
||||||
|
module.D.data.fill_(1.0)
|
||||||
|
|
||||||
dt = torch.exp(
|
dt = torch.exp(
|
||||||
torch.rand(self.config.num_heads)
|
torch.rand(self.config.num_heads)
|
||||||
@@ -733,16 +739,14 @@ class Mamba2PreTrainedModel(PreTrainedModel):
|
|||||||
|
|
||||||
# # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
|
# # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
|
||||||
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
||||||
with torch.no_grad():
|
|
||||||
module.dt_bias.copy_(inv_dt)
|
module.dt_bias.copy_(inv_dt)
|
||||||
module.dt_bias._no_reinit = True
|
module.dt_bias._no_reinit = True
|
||||||
|
|
||||||
if isinstance(module, nn.Linear):
|
nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5))
|
||||||
if module.bias is not None:
|
if module.conv1d.bias is not None:
|
||||||
if not getattr(module.bias, "_no_reinit", False):
|
if not getattr(module.conv1d.bias, "_no_reinit", False):
|
||||||
nn.init.zeros_(module.bias)
|
nn.init.zeros_(module.conv1d.bias)
|
||||||
elif isinstance(module, nn.Embedding):
|
nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5))
|
||||||
nn.init.normal_(module.weight, std=self.config.initializer_range)
|
|
||||||
|
|
||||||
if self.config.rescale_prenorm_residual:
|
if self.config.rescale_prenorm_residual:
|
||||||
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
|
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
|
||||||
@@ -751,16 +755,24 @@ class Mamba2PreTrainedModel(PreTrainedModel):
|
|||||||
# > -- GPT-2 :: https://openai.com/blog/better-language-models/
|
# > -- GPT-2 :: https://openai.com/blog/better-language-models/
|
||||||
#
|
#
|
||||||
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
|
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
|
||||||
for name, p in module.named_parameters():
|
|
||||||
if name in ["out_proj.weight"]:
|
|
||||||
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
|
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
|
||||||
# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|
# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|
||||||
# We need to reinit p since this code could be called multiple times
|
# We need to reinit p since this code could be called multiple times
|
||||||
# Having just p *= scale would repeatedly scale it down
|
# Having just p *= scale would repeatedly scale it down
|
||||||
nn.init.kaiming_uniform_(p, a=math.sqrt(5))
|
p = module.out_proj.weight
|
||||||
with torch.no_grad():
|
|
||||||
p /= math.sqrt(self.config.num_hidden_layers)
|
p /= math.sqrt(self.config.num_hidden_layers)
|
||||||
|
|
||||||
|
if isinstance(module, nn.Linear):
|
||||||
|
if not getattr(module.weight, "_no_reinit", False):
|
||||||
|
nn.init.normal_(module.weight, std=std)
|
||||||
|
if module.bias is not None:
|
||||||
|
if not getattr(module.bias, "_no_reinit", False):
|
||||||
|
nn.init.zeros_(module.bias)
|
||||||
|
elif isinstance(module, (Mamba2RMSNorm, MambaRMSNormGated)):
|
||||||
|
module.weight.data.fill_(1.0)
|
||||||
|
elif isinstance(module, nn.Embedding):
|
||||||
|
nn.init.normal_(module.weight, std=std)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@auto_docstring(
|
@auto_docstring(
|
||||||
|
|||||||
@@ -440,10 +440,13 @@ class MusicgenPreTrainedModel(PreTrainedModel):
|
|||||||
|
|
||||||
def _init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
std = self.config.initializer_factor
|
std = self.config.initializer_factor
|
||||||
if isinstance(module, (nn.Linear, nn.Conv1d)):
|
if isinstance(module, nn.Linear):
|
||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.bias is not None:
|
if module.bias is not None:
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
|
elif isinstance(module, nn.LayerNorm):
|
||||||
|
module.weight.data.fill_(1.0)
|
||||||
|
module.bias.data.zero_()
|
||||||
elif isinstance(module, nn.Embedding):
|
elif isinstance(module, nn.Embedding):
|
||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.padding_idx is not None:
|
if module.padding_idx is not None:
|
||||||
|
|||||||
@@ -406,10 +406,13 @@ class MusicgenMelodyPreTrainedModel(PreTrainedModel):
|
|||||||
|
|
||||||
def _init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
std = self.config.initializer_factor
|
std = self.config.initializer_factor
|
||||||
if isinstance(module, (nn.Linear, nn.Conv1d)):
|
if isinstance(module, nn.Linear):
|
||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.bias is not None:
|
if module.bias is not None:
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
|
elif isinstance(module, nn.LayerNorm):
|
||||||
|
module.weight.data.fill_(1.0)
|
||||||
|
module.bias.data.zero_()
|
||||||
elif isinstance(module, nn.Embedding):
|
elif isinstance(module, nn.Embedding):
|
||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.padding_idx is not None:
|
if module.padding_idx is not None:
|
||||||
@@ -1286,7 +1289,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin):
|
|||||||
The text encoder model that encodes text into hidden states for conditioning.
|
The text encoder model that encodes text into hidden states for conditioning.
|
||||||
audio_encoder (`PreTrainedModel`, *optional*):
|
audio_encoder (`PreTrainedModel`, *optional*):
|
||||||
The audio encoder model that encodes audio into hidden states for conditioning.
|
The audio encoder model that encodes audio into hidden states for conditioning.
|
||||||
decoder (`MusicgenForCausalLM`, *optional*):
|
decoder (`MusicgenMelodyForCausalLM`, *optional*):
|
||||||
The decoder model that generates audio tokens based on conditioning signals.
|
The decoder model that generates audio tokens based on conditioning signals.
|
||||||
"""
|
"""
|
||||||
if config is None and None in (text_encoder, audio_encoder, decoder):
|
if config is None and None in (text_encoder, audio_encoder, decoder):
|
||||||
|
|||||||
@@ -1006,10 +1006,15 @@ class OmDetTurboPreTrainedModel(PreTrainedModel):
|
|||||||
nn.init.xavier_uniform_(module.query_position_head.layers[1].weight)
|
nn.init.xavier_uniform_(module.query_position_head.layers[1].weight)
|
||||||
for layer in module.channel_projection_layers:
|
for layer in module.channel_projection_layers:
|
||||||
nn.init.xavier_uniform_(layer[0].weight)
|
nn.init.xavier_uniform_(layer[0].weight)
|
||||||
|
elif isinstance(module, OmDetTurboLanguageBackbone):
|
||||||
|
nn.init.normal_(module.text_projection, std=self.config.text_projection_in_dim**-0.5)
|
||||||
elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
|
elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
|
||||||
module.weight.data.normal_(mean=0.0, std=self.config.init_std)
|
module.weight.data.normal_(mean=0.0, std=self.config.init_std)
|
||||||
if module.bias is not None:
|
if module.bias is not None:
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
|
elif isinstance(module, nn.LayerNorm):
|
||||||
|
module.weight.data.fill_(1.0)
|
||||||
|
module.bias.data.zero_()
|
||||||
|
|
||||||
def _set_gradient_checkpointing(self, module, value=False):
|
def _set_gradient_checkpointing(self, module, value=False):
|
||||||
if isinstance(module, OmDetTurboDecoder):
|
if isinstance(module, OmDetTurboDecoder):
|
||||||
|
|||||||
@@ -283,6 +283,9 @@ class Qwen2AudioPreTrainedModel(PreTrainedModel):
|
|||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.bias is not None:
|
if module.bias is not None:
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
|
elif isinstance(module, nn.LayerNorm):
|
||||||
|
module.weight.data.fill_(1.0)
|
||||||
|
module.bias.data.zero_()
|
||||||
elif isinstance(module, nn.Embedding):
|
elif isinstance(module, nn.Embedding):
|
||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.padding_idx is not None:
|
if module.padding_idx is not None:
|
||||||
|
|||||||
@@ -604,7 +604,7 @@ class SegGptPreTrainedModel(PreTrainedModel):
|
|||||||
supports_gradient_checkpointing = True
|
supports_gradient_checkpointing = True
|
||||||
_no_split_modules = ["SegGptEmbeddings", "SegGptLayer"]
|
_no_split_modules = ["SegGptEmbeddings", "SegGptLayer"]
|
||||||
|
|
||||||
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
|
def _init_weights(self, module: nn.Module) -> None:
|
||||||
"""Initialize the weights"""
|
"""Initialize the weights"""
|
||||||
std = self.config.initializer_range
|
std = self.config.initializer_range
|
||||||
if isinstance(module, (nn.Linear, nn.Conv2d)):
|
if isinstance(module, (nn.Linear, nn.Conv2d)):
|
||||||
@@ -615,7 +615,7 @@ class SegGptPreTrainedModel(PreTrainedModel):
|
|||||||
)
|
)
|
||||||
if module.bias is not None:
|
if module.bias is not None:
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
elif isinstance(module, nn.LayerNorm):
|
elif isinstance(module, (nn.LayerNorm, SegGptLayerNorm)):
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
module.weight.data.fill_(1.0)
|
module.weight.data.fill_(1.0)
|
||||||
elif isinstance(module, SegGptAttention):
|
elif isinstance(module, SegGptAttention):
|
||||||
|
|||||||
@@ -551,17 +551,18 @@ class SuperGluePreTrainedModel(PreTrainedModel):
|
|||||||
|
|
||||||
def _init_weights(self, module: nn.Module) -> None:
|
def _init_weights(self, module: nn.Module) -> None:
|
||||||
"""Initialize the weights"""
|
"""Initialize the weights"""
|
||||||
if isinstance(module, (nn.Linear, nn.Conv2d, nn.Conv1d)):
|
if isinstance(module, (nn.Linear, nn.Conv2d)):
|
||||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||||
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||||
if module.bias is not None:
|
if module.bias is not None:
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
elif isinstance(module, nn.LayerNorm):
|
elif isinstance(module, nn.BatchNorm1d):
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
module.weight.data.fill_(1.0)
|
module.weight.data.fill_(1.0)
|
||||||
elif isinstance(module, SuperGlueMultiLayerPerceptron):
|
|
||||||
nn.init.constant_(module.linear.bias, 0.0)
|
if hasattr(module, "bin_score"):
|
||||||
|
module.bin_score.data.fill_(1.0)
|
||||||
|
|
||||||
|
|
||||||
@auto_docstring(
|
@auto_docstring(
|
||||||
|
|||||||
@@ -310,12 +310,13 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
|||||||
|
|
||||||
def test_feed_forward_chunking(self):
|
def test_feed_forward_chunking(self):
|
||||||
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
|
(original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
# original_config.norm_type = "time_group_norm"
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
torch.manual_seed(0)
|
torch.manual_seed(0)
|
||||||
config = copy.deepcopy(original_config)
|
config = copy.deepcopy(original_config)
|
||||||
config.chunk_length_s = None
|
config.chunk_length_s = None
|
||||||
config.overlap = None
|
config.overlap = None
|
||||||
config.sampling_rate = 10
|
config.sampling_rate = 20
|
||||||
|
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
@@ -326,9 +327,9 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
|||||||
hidden_states_no_chunk = model(**inputs)[1]
|
hidden_states_no_chunk = model(**inputs)[1]
|
||||||
|
|
||||||
torch.manual_seed(0)
|
torch.manual_seed(0)
|
||||||
config.chunk_length_s = 1
|
config.chunk_length_s = 2
|
||||||
config.overlap = 0
|
config.overlap = 0
|
||||||
config.sampling_rate = 10
|
config.sampling_rate = 20
|
||||||
|
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ from transformers.testing_utils import (
|
|||||||
|
|
||||||
from ...generation.test_utils import GenerationTesterMixin
|
from ...generation.test_utils import GenerationTesterMixin
|
||||||
from ...test_configuration_common import ConfigTester
|
from ...test_configuration_common import ConfigTester
|
||||||
from ...test_modeling_common import ModelTesterMixin, ids_tensor
|
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
|
||||||
from ...test_pipeline_mixin import PipelineTesterMixin
|
from ...test_pipeline_mixin import PipelineTesterMixin
|
||||||
|
|
||||||
|
|
||||||
@@ -359,9 +359,11 @@ class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest
|
|||||||
|
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
config.rescale_prenorm_residual = True
|
||||||
|
|
||||||
|
configs_no_init = _config_zero_init(config)
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config=config)
|
model = model_class(config=configs_no_init)
|
||||||
for name, param in model.named_parameters():
|
for name, param in model.named_parameters():
|
||||||
if "dt_proj.bias" in name:
|
if "dt_proj.bias" in name:
|
||||||
dt = torch.exp(
|
dt = torch.exp(
|
||||||
@@ -380,6 +382,19 @@ class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest
|
|||||||
if param.requires_grad:
|
if param.requires_grad:
|
||||||
# check if it's a ones like
|
# check if it's a ones like
|
||||||
torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
|
torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
|
||||||
|
else:
|
||||||
|
if param.requires_grad:
|
||||||
|
if (
|
||||||
|
"mixer.conv1d.weight" in name
|
||||||
|
or "mixer.dt_proj.weight" in name
|
||||||
|
or "mixer.out_proj.weight" in name
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
self.assertIn(
|
||||||
|
((param.data.mean() * 1e9).round() / 1e9).item(),
|
||||||
|
[0.0, 1.0],
|
||||||
|
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||||
|
)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
# Ignore copy
|
# Ignore copy
|
||||||
|
|||||||
@@ -586,6 +586,8 @@ class GroundingDinoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes
|
|||||||
or "value_proj" in name
|
or "value_proj" in name
|
||||||
or "output_proj" in name
|
or "output_proj" in name
|
||||||
or "reference_points" in name
|
or "reference_points" in name
|
||||||
|
or "vision_proj" in name
|
||||||
|
or "text_proj" in name
|
||||||
):
|
):
|
||||||
continue
|
continue
|
||||||
self.assertIn(
|
self.assertIn(
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ from transformers.testing_utils import require_torch, require_torch_multi_gpu, s
|
|||||||
|
|
||||||
from ...generation.test_utils import GenerationTesterMixin
|
from ...generation.test_utils import GenerationTesterMixin
|
||||||
from ...test_configuration_common import ConfigTester
|
from ...test_configuration_common import ConfigTester
|
||||||
from ...test_modeling_common import ModelTesterMixin, ids_tensor
|
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
|
||||||
from ...test_pipeline_mixin import PipelineTesterMixin
|
from ...test_pipeline_mixin import PipelineTesterMixin
|
||||||
|
|
||||||
|
|
||||||
@@ -326,9 +326,11 @@ class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
|
|||||||
|
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
config.rescale_prenorm_residual = True
|
||||||
|
|
||||||
|
configs_no_init = _config_zero_init(config)
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config=config)
|
model = model_class(config=configs_no_init)
|
||||||
for name, param in model.named_parameters():
|
for name, param in model.named_parameters():
|
||||||
if "dt_proj.bias" in name:
|
if "dt_proj.bias" in name:
|
||||||
dt = torch.exp(
|
dt = torch.exp(
|
||||||
@@ -347,6 +349,19 @@ class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
|
|||||||
if param.requires_grad:
|
if param.requires_grad:
|
||||||
# check if it's a ones like
|
# check if it's a ones like
|
||||||
torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
|
torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
|
||||||
|
else:
|
||||||
|
if param.requires_grad:
|
||||||
|
if (
|
||||||
|
"mixer.conv1d.weight" in name
|
||||||
|
or "mixer.dt_proj.weight" in name
|
||||||
|
or "mixer.out_proj.weight" in name
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
self.assertIn(
|
||||||
|
((param.data.mean() * 1e9).round() / 1e9).item(),
|
||||||
|
[0.0, 1.0],
|
||||||
|
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||||
|
)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
|
|||||||
@@ -13,6 +13,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import math
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from transformers import AutoTokenizer, Mamba2Config, is_torch_available
|
from transformers import AutoTokenizer, Mamba2Config, is_torch_available
|
||||||
@@ -28,7 +29,7 @@ from transformers.utils.import_utils import is_causal_conv1d_available, is_mamba
|
|||||||
|
|
||||||
from ...generation.test_utils import GenerationTesterMixin
|
from ...generation.test_utils import GenerationTesterMixin
|
||||||
from ...test_configuration_common import ConfigTester
|
from ...test_configuration_common import ConfigTester
|
||||||
from ...test_modeling_common import ModelTesterMixin, ids_tensor
|
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
|
||||||
from ...test_pipeline_mixin import PipelineTesterMixin
|
from ...test_pipeline_mixin import PipelineTesterMixin
|
||||||
|
|
||||||
|
|
||||||
@@ -276,14 +277,37 @@ class Mamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
|
|||||||
|
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
config.rescale_prenorm_residual = True
|
||||||
|
|
||||||
|
configs_no_init = _config_zero_init(config)
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config=config)
|
model = model_class(config=configs_no_init)
|
||||||
for name, param in model.named_parameters():
|
for name, param in model.named_parameters():
|
||||||
if "D" in name:
|
if "dt_proj.bias" in name:
|
||||||
|
dt = torch.exp(
|
||||||
|
torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min))
|
||||||
|
+ math.log(config.time_step_min)
|
||||||
|
).clamp(min=config.time_step_floor)
|
||||||
|
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
||||||
|
if param.requires_grad:
|
||||||
|
self.assertTrue(param.data.max().item() <= inv_dt[1])
|
||||||
|
self.assertTrue(param.data.min().item() >= inv_dt[0])
|
||||||
|
elif "A_log" in name:
|
||||||
|
A = torch.arange(1, config.num_heads + 1)
|
||||||
|
torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5)
|
||||||
|
elif "D" in name:
|
||||||
if param.requires_grad:
|
if param.requires_grad:
|
||||||
# check if it's a ones like
|
# check if it's a ones like
|
||||||
torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
|
torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
|
||||||
|
else:
|
||||||
|
if param.requires_grad:
|
||||||
|
if "mixer.conv1d.weight" in name or "mixer.dt_bias" in name or "mixer.out_proj.weight" in name:
|
||||||
|
continue
|
||||||
|
self.assertIn(
|
||||||
|
((param.data.mean() * 1e9).round() / 1e9).item(),
|
||||||
|
[0.0, 1.0],
|
||||||
|
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||||
|
)
|
||||||
|
|
||||||
@unittest.skip(reason="Mamba 2 weights are not tied")
|
@unittest.skip(reason="Mamba 2 weights are not tied")
|
||||||
def test_tied_weights_keys(self):
|
def test_tied_weights_keys(self):
|
||||||
|
|||||||
@@ -629,6 +629,7 @@ class OmDetTurboModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
|
|||||||
or "decoder.channel_projection_layers" in name
|
or "decoder.channel_projection_layers" in name
|
||||||
or "query_position_head" in name
|
or "query_position_head" in name
|
||||||
or "decoder.encoder_vision_features" in name
|
or "decoder.encoder_vision_features" in name
|
||||||
|
or "language_backbone.text_projection" in name
|
||||||
):
|
):
|
||||||
continue
|
continue
|
||||||
self.assertIn(
|
self.assertIn(
|
||||||
|
|||||||
@@ -153,10 +153,18 @@ class TimmWrapperModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
|
|||||||
def test_retain_grad_hidden_states_attentions(self):
|
def test_retain_grad_hidden_states_attentions(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@unittest.skip(reason="TimmWrapper initialization is managed on the timm side")
|
||||||
|
def test_can_init_all_missing_weights(self):
|
||||||
|
pass
|
||||||
|
|
||||||
@unittest.skip(reason="TimmWrapper initialization is managed on the timm side")
|
@unittest.skip(reason="TimmWrapper initialization is managed on the timm side")
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@unittest.skip(reason="TimmWrapper initialization is managed on the timm side")
|
||||||
|
def test_mismatched_shapes_have_properly_initialized_weights(self):
|
||||||
|
pass
|
||||||
|
|
||||||
@unittest.skip(reason="Need to use a timm model and there is no tiny model available.")
|
@unittest.skip(reason="Need to use a timm model and there is no tiny model available.")
|
||||||
def test_model_is_small(self):
|
def test_model_is_small(self):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -855,7 +855,7 @@ class ModelTesterMixin:
|
|||||||
# For now, skip everything older than 2025 and "important models" (too much models to patch otherwise)
|
# For now, skip everything older than 2025 and "important models" (too much models to patch otherwise)
|
||||||
# Use `supports_cache_class` as a proxy to judge "important" models in order to prioritize them
|
# Use `supports_cache_class` as a proxy to judge "important" models in order to prioritize them
|
||||||
# TODO: relax this as we patch more and more models
|
# TODO: relax this as we patch more and more models
|
||||||
if addition_year < 2025 and not model_class._supports_cache_class:
|
if addition_year < 2024 and not model_class._supports_cache_class:
|
||||||
self.skipTest(reason=f"{model_class} is not a priorited model for now.")
|
self.skipTest(reason=f"{model_class} is not a priorited model for now.")
|
||||||
|
|
||||||
# Monkey patch the method to add a seed (we do it on PreTrainedModel._initialize_weights, which wraps
|
# Monkey patch the method to add a seed (we do it on PreTrainedModel._initialize_weights, which wraps
|
||||||
@@ -895,6 +895,11 @@ class ModelTesterMixin:
|
|||||||
model_from_config.state_dict().items(), model_from_pretrained.state_dict().items()
|
model_from_config.state_dict().items(), model_from_pretrained.state_dict().items()
|
||||||
):
|
):
|
||||||
self.assertEqual(k1, k2, "The keys from each model should be the same")
|
self.assertEqual(k1, k2, "The keys from each model should be the same")
|
||||||
|
|
||||||
|
# In case using torch.nn.utils.parametrizations on a module, we should skip the resulting keys
|
||||||
|
if re.search(r"\.parametrizations\..*?\.original[01]", k1):
|
||||||
|
continue
|
||||||
|
|
||||||
# Since we added the seed, they should be exactly the same (i.e. using allclose maybe be wrong due
|
# Since we added the seed, they should be exactly the same (i.e. using allclose maybe be wrong due
|
||||||
# to very low std in init function)
|
# to very low std in init function)
|
||||||
if not (v1 == v2).all():
|
if not (v1 == v2).all():
|
||||||
|
|||||||
Reference in New Issue
Block a user