enable 6 granite cases on xpu (#37569)

* enable 6 granite cases on XPU

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* make them all pass on A100

Signed-off-by: N <matrix.yao@intel.com>

* fix style

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* update

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Signed-off-by: N <matrix.yao@intel.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-04-22 23:55:02 +08:00
parent 9167461a7d
commit 6673081b21
3 changed files with 104 additions and 24 deletions
--- a/tests/models/granite/test_modeling_granite.py
+++ b/tests/models/granite/test_modeling_granite.py
@@ -19,9 +19,10 @@ from parameterized import parameterized

 from transformers import GraniteConfig, is_torch_available, set_seed
 from transformers.testing_utils import (
+    Expectations,
    require_read_token,
    require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
    slow,
    torch_device,
 )
@@ -302,7 +303,7 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
            torch.testing.assert_close(yarn_sin_long, original_sin_long)


-@require_torch_gpu
+@require_torch_accelerator
 class GraniteIntegrationTest(unittest.TestCase):
    # This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
    # Depending on the hardware we get different logits / generations
@@ -328,15 +329,27 @@ class GraniteIntegrationTest(unittest.TestCase):
        # Expected mean on dim = -1

        # fmt: off
-        EXPECTED_MEAN = torch.tensor([[-1.9798, -3.1626, -2.8062, -2.3777, -2.7091, -2.2338, -2.5924, -2.3974]])
+        EXPECTED_MEANS = Expectations(
+                {
+                    ("xpu", 3): torch.tensor([[-3.1406, -2.5469, -2.6250, -2.1250, -2.6250, -2.6562, -2.6875, -2.9688]]),
+                    ("cuda", 7): torch.tensor([[-1.9798, -3.1626, -2.8062, -2.3777, -2.7091, -2.2338, -2.5924, -2.3974]]),
+                    ("cuda", 8): torch.tensor([[-3.1406, -2.5469, -2.6250, -2.1250, -2.6250, -2.6562, -2.6875, -2.9688]]),
+                }
+            )
+        EXPECTED_MEAN = EXPECTED_MEANS.get_expectation()

-        torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.mean(-1), rtol=1e-2, atol=1e-2)
+        torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.mean(-1).float(), rtol=1e-2, atol=1e-2)

        # slicing logits[0, 0, 0:15]
-        EXPECTED_SLICE = torch.tensor([[4.8750, -2.1875, -2.1875, -2.1875, -2.1875, -2.8438, -2.1875, -2.1875,
-        -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875]])
+        EXPECTED_SLICES = Expectations(
+                {
+                    ("xpu", 3): torch.tensor([[2.2031, -5.0625, -5.0625, -5.0625, -5.0625, -0.9180, -5.0625, -5.0625, -5.0625, -5.0625, -5.5312, -2.1719, -1.7891, -0.4922, -2.5469]]),
+                    ("cuda", 7): torch.tensor([[4.8750, -2.1875, -2.1875, -2.1875, -2.1875, -2.8438, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875]]),
+                    ("cuda", 8): torch.tensor([[2.0938, -5.0312, -5.0312, -5.0312, -5.0312, -1.0469, -5.0312, -5.0312, -5.0312, -5.0312, -5.5625, -2.1875, -1.7891, -0.5820, -2.6250]]),
+                }
+            )
+        EXPECTED_SLICE = EXPECTED_SLICES.get_expectation()
        # fmt: on
-
        self.assertTrue(
            torch.allclose(
                EXPECTED_SLICE.to(torch_device),
@@ -358,6 +371,13 @@ class GraniteIntegrationTest(unittest.TestCase):

        # fmt: off
        # Expected mean on dim = -1
-        EXPECTED_MEAN = torch.tensor([[-2.0984, -3.1294, -2.8153, -2.3568, -2.7337, -2.2624, -2.6016, -2.4022]])
+        EXPECTED_MEANS = Expectations(
+                {
+                    ("xpu", 3): torch.tensor([[-3.2693, -2.5957, -2.6234, -2.1675, -2.6386, -2.6850, -2.7039, -2.9656]]),
+                    ("cuda", 7): torch.tensor([[-2.0984, -3.1294, -2.8153, -2.3568, -2.7337, -2.2624, -2.6016, -2.4022]]),
+                    ("cuda", 8): torch.tensor([[-3.2934, -2.6019, -2.6258, -2.1691, -2.6394, -2.6876, -2.7032, -2.9688]]),
+                }
+            )
+        EXPECTED_MEAN = EXPECTED_MEANS.get_expectation()

        torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2)
--- a/tests/models/granitemoe/test_modeling_granitemoe.py
+++ b/tests/models/granitemoe/test_modeling_granitemoe.py
@@ -19,9 +19,10 @@ from parameterized import parameterized

 from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed
 from transformers.testing_utils import (
+    Expectations,
    require_read_token,
    require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
    slow,
    torch_device,
 )
@@ -301,7 +302,7 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
            torch.testing.assert_close(yarn_sin_long, original_sin_long)


-@require_torch_gpu
+@require_torch_accelerator
 class GraniteMoeIntegrationTest(unittest.TestCase):
    # This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
    # Depending on the hardware we get different logits / generations
@@ -325,13 +326,26 @@ class GraniteMoeIntegrationTest(unittest.TestCase):

        # fmt: off
        # Expected mean on dim = -1
-        EXPECTED_MEAN = torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]])
+        EXPECTED_MEANS = Expectations(
+                {
+                    ("xpu", 3): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]),
+                    ("cuda", 7): torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]),
+                    ("cuda", 8): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]),
+                }
+            )
+        EXPECTED_MEAN = EXPECTED_MEANS.get_expectation()

        torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2)

        # slicing logits[0, 0, 0:15]
-        EXPECTED_SLICE = torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892,
-        -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]])
+        EXPECTED_SLICES = Expectations(
+                {
+                    ("xpu", 3): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]),
+                    ("cuda", 7): torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]]),
+                    ("cuda", 8): torch.tensor([[2.5479, -9.2124, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2162, -9.2122, -6.3101, -3.6224, -3.6377, -5.2542, -5.2524]]),
+                }
+            )
+        EXPECTED_SLICE = EXPECTED_SLICES.get_expectation()
        # fmt: on

        self.assertTrue(
@@ -346,10 +360,26 @@ class GraniteMoeIntegrationTest(unittest.TestCase):
    @slow
    def test_model_3b_generation(self):
        # ground truth text generated with dola_layers="low", repetition_penalty=1.2
-        EXPECTED_TEXT_COMPLETION = (
+        EXPECTED_TEXT_COMPLETIONS = Expectations(
+            {
+                ("xpu", 3): (
+                    "Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n"
+                    "The first part is easy to understand. The second part is a little more difficult.\n\n"
+                    "The second part of the theory of relativity is a little more difficult to understand.\n"
+                ),
+                ("cuda", 7): (
                    "Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = "
                    "\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time"
+                ),
+                ("cuda", 8): (
+                    "Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n"
+                    "The first part is easy to understand. The second part is a little more difficult.\n\n"
+                    "The second part of the theory of relativity is a little more difficult to understand.\n"
+                ),
+            }
        )
+        EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
+
        prompt = "Simply put, the theory of relativity states that "
        tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")
        model = GraniteMoeForCausalLM.from_pretrained("ibm/PowerMoE-3b", device_map="auto")
--- a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py
+++ b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py
@@ -19,9 +19,10 @@ from parameterized import parameterized

 from transformers import AutoTokenizer, GraniteMoeSharedConfig, is_torch_available, set_seed
 from transformers.testing_utils import (
+    Expectations,
    require_read_token,
    require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
    slow,
    torch_device,
 )
@@ -304,7 +305,7 @@ class GraniteMoeSharedModelTest(ModelTesterMixin, GenerationTesterMixin, unittes
            torch.testing.assert_close(yarn_sin_long, original_sin_long)


-@require_torch_gpu
+@require_torch_accelerator
 class GraniteMoeSharedIntegrationTest(unittest.TestCase):
    # This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
    # Depending on the hardware we get different logits / generations
@@ -328,13 +329,26 @@ class GraniteMoeSharedIntegrationTest(unittest.TestCase):

        # fmt: off
        # Expected mean on dim = -1
-        EXPECTED_MEAN = torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]])
+        EXPECTED_MEANS = Expectations(
+                {
+                    ("xpu", 3): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]),
+                    ("cuda", 7): torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]),
+                    ("cuda", 8): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]),
+                }
+            )

+        EXPECTED_MEAN = EXPECTED_MEANS.get_expectation()
        torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2)

        # slicing logits[0, 0, 0:15]
-        EXPECTED_SLICE = torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892,
-        -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]])
+        EXPECTED_SLICES = Expectations(
+                {
+                    ("xpu", 3): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]),
+                    ("cuda", 7): torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]]),
+                    ("cuda", 8): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]),
+                }
+            )
+        EXPECTED_SLICE = EXPECTED_SLICES.get_expectation()
        # fmt: on

        self.assertTrue(
@@ -349,10 +363,26 @@ class GraniteMoeSharedIntegrationTest(unittest.TestCase):
    @slow
    def test_model_3b_generation(self):
        # ground truth text generated with dola_layers="low", repetition_penalty=1.2
-        EXPECTED_TEXT_COMPLETION = (
+        EXPECTED_TEXT_COMPLETIONS = Expectations(
+            {
+                ("xpu", 3): (
+                    "Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n"
+                    "The first part is easy to understand. The second part is a little more difficult.\n\n"
+                    "The second part of the theory of relativity is a little more difficult to understand.\n"
+                ),
+                ("cuda", 7): (
                    "Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = "
                    "\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time"
+                ),
+                ("cuda", 8): (
+                    "Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n"
+                    "The first part is easy to understand. The second part is a little more difficult.\n\n"
+                    "The second part of the theory of relativity is a little more difficult to understand.\n"
+                ),
+            }
        )
+        EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
+
        prompt = "Simply put, the theory of relativity states that "
        tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")
        model = GraniteMoeSharedForCausalLM.from_pretrained("ibm/PowerMoE-3b", device_map="auto")