Fix modular edge case + modular sorting order (#35562)

* look-ahead negation

* re add examples by default

* Fix the bug in topological sort

* Update create_dependency_mapping.py

* start adding test

* finalize test

* more tests

* style

* style
This commit is contained in:
Cyril Vallez
2025-01-09 17:17:52 +01:00
committed by GitHub
parent d3fe9fa3fe
commit 46276f9a7f
9 changed files with 98 additions and 57 deletions

View File

@@ -43,7 +43,7 @@ class MyNewModelConfig(PretrainedConfig):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. MyNewModel 1 supports up to 2048 tokens,
MyNewModel 2 up to 4096, CodeMyNewModel up to 16384.
MyNewModel 2 up to 4096, CodeLlama up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
@@ -110,7 +110,7 @@ class MyNewModelConfig(PretrainedConfig):
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
head_dim (`int`, *optional*):
The attention head dimension. If None, it will default to hidden_size // num_heads
The attention head dimension. If None, it will default to hidden_size // num_attention_heads
```python
>>> from transformers import MyNewModelModel, MyNewModelConfig

View File

@@ -597,7 +597,7 @@ class DummyModel(DummyPreTrainedModel):
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
if attention_mask is not None and 0.0 in attention_mask:
if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None

View File

@@ -597,7 +597,7 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
if attention_mask is not None and 0.0 in attention_mask:
if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None

View File

@@ -602,7 +602,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
if attention_mask is not None and 0.0 in attention_mask:
if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None

View File

@@ -519,7 +519,7 @@ class SuperModel(SuperPreTrainedModel):
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
if attention_mask is not None and 0.0 in attention_mask:
if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None