From 1897874edcba58e0e585305ca71f6ba727315219 Mon Sep 17 00:00:00 2001
From: fxmarty <9808326+fxmarty@users.noreply.github.com>
Date: Mon, 8 Apr 2024 16:25:07 +0200
Subject: [PATCH] Fix falcon with SDPA, alibi but no passed mask (#30123)

* fix falcon without attention_mask & alibi

* add test

* Update tests/models/falcon/test_modeling_falcon.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 .../models/falcon/modeling_falcon.py          | 24 ++++++++-----------
 tests/models/falcon/test_modeling_falcon.py   | 24 +++++++++++++++++++
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py
index f1cff3f181..c2f3f91d5c 100644
--- a/src/transformers/models/falcon/modeling_falcon.py
+++ b/src/transformers/models/falcon/modeling_falcon.py
@@ -1098,27 +1098,23 @@ class FalconModel(FalconPreTrainedModel):
             elif head_mask is None:
                 alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:])
 
-                attention_mask_2d = attention_mask
                 # We don't call _prepare_4d_causal_attention_mask_for_sdpa as we need to mask alibi using the 4D attention_mask untouched.
                 attention_mask = _prepare_4d_causal_attention_mask(
                     attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
                 )
 
                 # We take care to integrate alibi bias in the attention_mask here.
-                if attention_mask_2d is None:
-                    attention_mask = alibi / math.sqrt(self.config.hidden_size // self.num_heads)
-                else:
-                    min_dtype = torch.finfo(alibi.dtype).min
-                    attention_mask = torch.masked_fill(
-                        alibi / math.sqrt(self.config.hidden_size // self.num_heads),
-                        attention_mask < -1,
-                        min_dtype,
-                    )
+                min_dtype = torch.finfo(alibi.dtype).min
+                attention_mask = torch.masked_fill(
+                    alibi / math.sqrt(self.config.hidden_size // self.num_heads),
+                    attention_mask < -1,
+                    min_dtype,
+                )
 
-                    # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend
-                    # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213
-                    if seq_length > 1 and attention_mask.device.type == "cuda":
-                        attention_mask = AttentionMaskConverter._unmask_unattended(attention_mask, min_dtype=min_dtype)
+                # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend
+                # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213
+                if seq_length > 1 and attention_mask.device.type == "cuda":
+                    attention_mask = AttentionMaskConverter._unmask_unattended(attention_mask, min_dtype=min_dtype)
             else:
                 # PyTorch SDPA does not support head_mask, we fall back on the eager implementation in this case.
                 attention_mask = _prepare_4d_causal_attention_mask(
diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py
index 35708d868d..c8ad2d9b32 100644
--- a/tests/models/falcon/test_modeling_falcon.py
+++ b/tests/models/falcon/test_modeling_falcon.py
@@ -666,3 +666,27 @@ class FalconLanguageGenerationTest(unittest.TestCase):
         self.assertLess(unpadded_inputs.input_ids.shape[-1], padded_inputs.input_ids.shape[-1])  # left-padding exists
         self.assertEqual(unpadded_gen_text[0], expected_output)
         self.assertEqual(padded_gen_text[0], expected_output)
+
+    @slow
+    @require_torch_sdpa
+    def test_falcon_alibi_sdpa_matches_eager(self):
+        input_ids = torch.randint(0, 1000, (5, 20))
+
+        config = FalconConfig(
+            vocab_size=1000,
+            hidden_size=64,
+            num_hidden_layers=3,
+            num_attention_heads=4,
+            new_decoder_architecture=True,
+            alibi=True,
+        )
+
+        falcon = FalconForCausalLM(config)
+        falcon = falcon.eval()
+
+        with torch.no_grad():
+            # output_attentions=True dispatches to eager path
+            falcon_output_eager = falcon(input_ids, output_attentions=True)[0]
+            falcon_output_sdpa = falcon(input_ids)[0]
+
+        self.assertTrue(torch.allclose(falcon_output_eager, falcon_output_sdpa, atol=1e-3))