Fix modular edge case + modular sorting order (#35562)

* look-ahead negation * re-add examples by default * Fix the bug in topological sort * Update create_dependency_mapping.py * start adding test * finalize test * more tests * style * style
2025-01-09 17:17:52 +01:00
parent d3fe9fa3fe
commit 46276f9a7f
9 changed files with 98 additions and 57 deletions
--- a/examples/modular-transformers/configuration_my_new_model.py
+++ b/examples/modular-transformers/configuration_my_new_model.py
@@ -43,7 +43,7 @@ class MyNewModelConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. MyNewModel 1 supports up to 2048 tokens,
-            MyNewModel 2 up to 4096, CodeMyNewModel up to 16384.
+            MyNewModel 2 up to 4096, CodeLlama up to 16384.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
@@ -110,7 +110,7 @@ class MyNewModelConfig(PretrainedConfig):
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        head_dim (`int`, *optional*):
-            The attention head dimension. If None, it will default to hidden_size // num_heads
+            The attention head dimension. If None, it will default to hidden_size // num_attention_heads

    ```python
    >>> from transformers import MyNewModelModel, MyNewModelConfig
--- a/examples/modular-transformers/modeling_dummy.py
+++ b/examples/modular-transformers/modeling_dummy.py
@@ -597,7 +597,7 @@ class DummyModel(DummyPreTrainedModel):
        output_attentions: bool,
    ):
        if self.config._attn_implementation == "flash_attention_2":
-            if attention_mask is not None and 0.0 in attention_mask:
+            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None

--- a/examples/modular-transformers/modeling_multimodal1.py
+++ b/examples/modular-transformers/modeling_multimodal1.py
@@ -597,7 +597,7 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
        output_attentions: bool,
    ):
        if self.config._attn_implementation == "flash_attention_2":
-            if attention_mask is not None and 0.0 in attention_mask:
+            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None

--- a/examples/modular-transformers/modeling_my_new_model2.py
+++ b/examples/modular-transformers/modeling_my_new_model2.py
@@ -602,7 +602,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
        output_attentions: bool,
    ):
        if self.config._attn_implementation == "flash_attention_2":
-            if attention_mask is not None and 0.0 in attention_mask:
+            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None

--- a/examples/modular-transformers/modeling_super.py
+++ b/examples/modular-transformers/modeling_super.py
@@ -519,7 +519,7 @@ class SuperModel(SuperPreTrainedModel):
        output_attentions: bool,
    ):
        if self.config._attn_implementation == "flash_attention_2":
-            if attention_mask is not None and 0.0 in attention_mask:
+            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None