From a1b82563f11d9101d54b06fd61aef8c90f63c9d2 Mon Sep 17 00:00:00 2001
From: Yao Matrix <matrix.yao@intel.com>
Date: Fri, 18 Apr 2025 18:28:08 +0800
Subject: [PATCH] enable 6 modeling cases on XPU (#37571)

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
---
 tests/models/bamba/test_modeling_bamba.py     | 49 +++++++++++++------
 tests/models/gemma/test_modeling_gemma.py     |  2 +-
 tests/models/mpt/test_modeling_mpt.py         |  8 ++-
 .../models/nemotron/test_modeling_nemotron.py | 17 +++++--
 4 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/tests/models/bamba/test_modeling_bamba.py b/tests/models/bamba/test_modeling_bamba.py
index c0703f8490..2159f427bd 100644
--- a/tests/models/bamba/test_modeling_bamba.py
+++ b/tests/models/bamba/test_modeling_bamba.py
@@ -19,7 +19,14 @@ import unittest
 import pytest
 
 from transformers import AutoTokenizer, BambaConfig, is_torch_available
-from transformers.testing_utils import Expectations, require_torch, require_torch_gpu, slow, torch_device
+from transformers.testing_utils import (
+    Expectations,
+    require_deterministic_for_xpu,
+    require_torch,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -474,7 +481,7 @@ class BambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
 
 @slow
 @require_torch
-@require_torch_gpu
+@require_torch_accelerator
 class BambaModelIntegrationTest(unittest.TestCase):
     model = None
     tokenizer = None
@@ -507,6 +514,10 @@ class BambaModelIntegrationTest(unittest.TestCase):
                     "rocm",
                     9,
                 ): "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
+                (
+                    "xpu",
+                    3,
+                ): "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all doing well. Today I",
             }
         )
 
@@ -536,22 +547,30 @@ class BambaModelIntegrationTest(unittest.TestCase):
 
             torch.testing.assert_close(logits[0, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD, rtol=1e-3, atol=1)
 
+    @require_deterministic_for_xpu
     def test_simple_batched_generate_with_padding(self):
         # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
         #
         # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
         # considering differences in hardware processing and potential deviations in generated text.
-        EXPECTED_TEXTS = {
-            7: [],
-            8: [
-                "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
-                "!!!<|begin_of_text|>I am late! I need to get to work! I have to get to the",
-            ],
-            9: [
-                "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
-                "!!!<|begin_of_text|>I am late! I need to be at the airport in 20 minutes! I",
-            ],
-        }
+        EXPECTED_TEXTS = Expectations(
+            {
+                ("cuda", 7): [],
+                ("cuda", 8): [
+                    "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
+                    "!!!<|begin_of_text|>I am late! I need to get to work! I have to get to the",
+                ],
+                ("rocm", 9): [
+                    "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
+                    "!!!<|begin_of_text|>I am late! I need to be at the airport in 20 minutes! I",
+                ],
+                ("xpu", 3): [
+                    "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all doing well. Today I",
+                    "!!!<|begin_of_text|>I am late! I need to get to work! I have to get to the",
+                ],
+            }
+        )
+        EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
 
         self.model.to(torch_device)
 
@@ -562,8 +581,8 @@ class BambaModelIntegrationTest(unittest.TestCase):
         ).to(torch_device)
         out = self.model.generate(**inputs, do_sample=False, max_new_tokens=10)
         output_sentences = self.tokenizer.batch_decode(out)
-        self.assertEqual(output_sentences[0], EXPECTED_TEXTS[self.cuda_compute_capability_major_version][0])
-        self.assertEqual(output_sentences[1], EXPECTED_TEXTS[self.cuda_compute_capability_major_version][1])
+        self.assertEqual(output_sentences[0], EXPECTED_TEXT[0])
+        self.assertEqual(output_sentences[1], EXPECTED_TEXT[1])
 
         # TODO: there are significant differences in the logits across major cuda versions, which shouldn't exist
         if self.cuda_compute_capability_major_version == 8:
diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py
index 5e45e78a44..4b7293817a 100644
--- a/tests/models/gemma/test_modeling_gemma.py
+++ b/tests/models/gemma/test_modeling_gemma.py
@@ -643,7 +643,7 @@ class GemmaIntegrationTest(unittest.TestCase):
         self.assertEqual(output_text, EXPECTED_TEXTS)
 
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_read_token
     def test_compile_static_cache(self):
         # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2
diff --git a/tests/models/mpt/test_modeling_mpt.py b/tests/models/mpt/test_modeling_mpt.py
index 458e458dd2..844ecfd46e 100644
--- a/tests/models/mpt/test_modeling_mpt.py
+++ b/tests/models/mpt/test_modeling_mpt.py
@@ -520,7 +520,13 @@ class MptIntegrationTests(unittest.TestCase):
 
         outputs = model(dummy_input, output_hidden_states=True)
 
-        expected_slice = torch.Tensor([-0.2520, -0.2178, -0.1953]).to(torch_device, torch.bfloat16)
+        expected_slices = Expectations(
+            {
+                ("xpu", 3): torch.Tensor([-0.2090, -0.2061, -0.1465]),
+                ("cuda", 7): torch.Tensor([-0.2520, -0.2178, -0.1953]),
+            }
+        )
+        expected_slice = expected_slices.get_expectation().to(torch_device, torch.bfloat16)
         predicted_slice = outputs.hidden_states[-1][0, 0, :3]
 
         torch.testing.assert_close(expected_slice, predicted_slice, rtol=1e-3, atol=1e-3)
diff --git a/tests/models/nemotron/test_modeling_nemotron.py b/tests/models/nemotron/test_modeling_nemotron.py
index 58c36b615f..d573537b6b 100644
--- a/tests/models/nemotron/test_modeling_nemotron.py
+++ b/tests/models/nemotron/test_modeling_nemotron.py
@@ -21,6 +21,7 @@ import pytest
 
 from transformers import NemotronConfig, is_torch_available
 from transformers.testing_utils import (
+    Expectations,
     is_flaky,
     require_flash_attn,
     require_read_token,
@@ -168,7 +169,7 @@ class NemotronModelTest(GemmaModelTest):
                 assert torch.allclose(logits_fa, logits, atol=1e-2)
 
 
-@require_torch_gpu
+@require_torch_accelerator
 class NemotronIntegrationTest(unittest.TestCase):
     # This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
     # Depending on the hardware we get different logits / generations
@@ -202,9 +203,17 @@ class NemotronIntegrationTest(unittest.TestCase):
     @require_read_token
     def test_nemotron_8b_generation_eager(self):
         text = ["What is the largest planet in solar system?"]
-        EXPECTED_TEXT = [
-            "What is the largest planet in solar system?\nAnswer: Jupiter\n\nWhat is the answer",
-        ]
+        EXPECTED_TEXTS = Expectations(
+            {
+                ("xpu", 3): [
+                    "What is the largest planet in solar system?\nAnswer: Jupiter\n\nWhat is the answer: What is the name of the 19",
+                ],
+                ("cuda", 7): [
+                    "What is the largest planet in solar system?\nAnswer: Jupiter\n\nWhat is the answer",
+                ],
+            }
+        )
+        EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
         model_id = "thhaus/nemotron3-8b"
         model = NemotronForCausalLM.from_pretrained(
             model_id, torch_dtype=torch.float16, device_map="auto", attn_implementation="eager"