[RoBERTa-based] Add support for sdpa (#30510)

* Adding SDPA support for RoBERTa-based models

* add not is_cross_attention

* fix copies

* fix test

* add minimal tests for camembert and xlm_roberta, as their test classes do not inherit from ModelTesterMixin

* address some review comments

* use copied from

* style

* consistency

* fix lists

---------

Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
JB (Don)
2024-08-28 16:26:00 +08:00
committed by GitHub
parent e0b87b0f40
commit f1a385b1de
11 changed files with 828 additions and 100 deletions

View File

@@ -16,7 +16,14 @@
import unittest
from transformers import is_torch_available
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
from transformers.testing_utils import (
require_sentencepiece,
require_tokenizers,
require_torch,
require_torch_sdpa,
slow,
torch_device,
)
if is_torch_available():
@@ -31,7 +38,7 @@ if is_torch_available():
class CamembertModelIntegrationTest(unittest.TestCase):
@slow
def test_output_embeds_base_model(self):
model = CamembertModel.from_pretrained("almanach/camembert-base")
model = CamembertModel.from_pretrained("almanach/camembert-base", attn_implementation="eager")
model.to(torch_device)
input_ids = torch.tensor(
@@ -54,3 +61,24 @@ class CamembertModelIntegrationTest(unittest.TestCase):
# expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
@slow
@require_torch_sdpa
def test_output_embeds_base_model_sdpa(self):
    """Run camembert-base with the SDPA attention implementation and check its output.

    Loads ``almanach/camembert-base`` with ``attn_implementation="sdpa"``, performs a
    single forward pass on one fixed input under ``torch.no_grad()``, and asserts that
    the ``[:, :3, :3]`` slice of ``last_hidden_state`` matches hard-coded expected
    values within ``atol=1e-4``.
    """
    # Token ids for the sentence: J'aime le camembert !
    token_ids = torch.tensor(
        [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]],
        device=torch_device,
        dtype=torch.long,
    )

    # Hard-coded expected values for the compared slice of the hidden states.
    reference = torch.tensor(
        [[[-0.0254, 0.0235, 0.1027], [0.0606, -0.1811, -0.0418], [-0.1561, -0.1127, 0.2687]]],
        device=torch_device,
        dtype=torch.float,
    )

    sdpa_model = CamembertModel.from_pretrained("almanach/camembert-base", attn_implementation="sdpa")
    sdpa_model = sdpa_model.to(torch_device)

    with torch.no_grad():
        hidden_states = sdpa_model(token_ids)["last_hidden_state"].detach()

    observed = hidden_states[:, :3, :3]
    self.assertTrue(torch.allclose(observed, reference, atol=1e-4))