Never fallback to eager implicitly (#38327)

* remove arg everywhere * Update warnings * add more models * Update sdpa_attention.py * fix style * fix * readd warnings but not for flex * Update test_modeling_common.py * skip * fix --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
2025-05-23 19:48:01 +02:00
parent e64ed0304c
commit e0aad278fe
73 changed files with 66 additions and 544 deletions
--- a/docs/source/en/modular_transformers.md
+++ b/docs/source/en/modular_transformers.md
@@ -243,13 +243,7 @@ class Olmo2Attention(OlmoAttention):

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,