From 6673081b215d6c2576b416efb62201bdf81d1ed9 Mon Sep 17 00:00:00 2001 From: Yao Matrix Date: Tue, 22 Apr 2025 23:55:02 +0800 Subject: [PATCH] enable 6 granite cases on xpu (#37569) * enable 6 granite cases on XPU Signed-off-by: YAO Matrix * make them all pass on A100 Signed-off-by: N * fix style Signed-off-by: YAO Matrix * update --------- Signed-off-by: YAO Matrix Signed-off-by: N Co-authored-by: ydshieh --- tests/models/granite/test_modeling_granite.py | 36 +++++++++++---- .../granitemoe/test_modeling_granitemoe.py | 46 +++++++++++++++---- .../test_modeling_granitemoeshared.py | 46 +++++++++++++++---- 3 files changed, 104 insertions(+), 24 deletions(-) diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index d0ea3c35a9..be1b5841ff 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -19,9 +19,10 @@ from parameterized import parameterized from transformers import GraniteConfig, is_torch_available, set_seed from transformers.testing_utils import ( + Expectations, require_read_token, require_torch, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -302,7 +303,7 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi torch.testing.assert_close(yarn_sin_long, original_sin_long) -@require_torch_gpu +@require_torch_accelerator class GraniteIntegrationTest(unittest.TestCase): # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) # Depending on the hardware we get different logits / generations @@ -328,15 +329,27 @@ class GraniteIntegrationTest(unittest.TestCase): # Expected mean on dim = -1 # fmt: off - EXPECTED_MEAN = torch.tensor([[-1.9798, -3.1626, -2.8062, -2.3777, -2.7091, -2.2338, -2.5924, -2.3974]]) + EXPECTED_MEANS = Expectations( + { + ("xpu", 3): torch.tensor([[-3.1406, -2.5469, -2.6250, -2.1250, -2.6250, -2.6562, -2.6875, -2.9688]]), + ("cuda", 7): torch.tensor([[-1.9798, -3.1626, -2.8062, -2.3777, -2.7091, -2.2338, -2.5924, -2.3974]]), + ("cuda", 8): torch.tensor([[-3.1406, -2.5469, -2.6250, -2.1250, -2.6250, -2.6562, -2.6875, -2.9688]]), + } + ) + EXPECTED_MEAN = EXPECTED_MEANS.get_expectation() - torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.mean(-1), rtol=1e-2, atol=1e-2) + torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.mean(-1).float(), rtol=1e-2, atol=1e-2) # slicing logits[0, 0, 0:15] - EXPECTED_SLICE = torch.tensor([[4.8750, -2.1875, -2.1875, -2.1875, -2.1875, -2.8438, -2.1875, -2.1875, - -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875]]) + EXPECTED_SLICES = Expectations( + { + ("xpu", 3): torch.tensor([[2.2031, -5.0625, -5.0625, -5.0625, -5.0625, -0.9180, -5.0625, -5.0625, -5.0625, -5.0625, -5.5312, -2.1719, -1.7891, -0.4922, -2.5469]]), + ("cuda", 7): torch.tensor([[4.8750, -2.1875, -2.1875, -2.1875, -2.1875, -2.8438, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875]]), + ("cuda", 8): torch.tensor([[2.0938, -5.0312, -5.0312, -5.0312, -5.0312, -1.0469, -5.0312, -5.0312, -5.0312, -5.0312, -5.5625, -2.1875, -1.7891, -0.5820, -2.6250]]), + } + ) + EXPECTED_SLICE = EXPECTED_SLICES.get_expectation() # fmt: on - self.assertTrue( torch.allclose( EXPECTED_SLICE.to(torch_device), @@ -358,6 +371,13 @@ class GraniteIntegrationTest(unittest.TestCase): # fmt: off # Expected mean on dim = -1 - EXPECTED_MEAN = torch.tensor([[-2.0984, -3.1294, -2.8153, -2.3568, -2.7337, -2.2624, -2.6016, -2.4022]]) + EXPECTED_MEANS = Expectations( + { + ("xpu", 3): torch.tensor([[-3.2693, -2.5957, -2.6234, -2.1675, -2.6386, -2.6850, -2.7039, -2.9656]]), + ("cuda", 7): torch.tensor([[-2.0984, -3.1294, -2.8153, -2.3568, -2.7337, -2.2624, -2.6016, -2.4022]]), + ("cuda", 8): torch.tensor([[-3.2934, -2.6019, -2.6258, -2.1691, -2.6394, -2.6876, -2.7032, -2.9688]]), + } + ) + EXPECTED_MEAN = EXPECTED_MEANS.get_expectation() torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2) diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index db43e10de1..e451ff30c8 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -19,9 +19,10 @@ from parameterized import parameterized from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed from transformers.testing_utils import ( + Expectations, require_read_token, require_torch, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -301,7 +302,7 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test torch.testing.assert_close(yarn_sin_long, original_sin_long) -@require_torch_gpu +@require_torch_accelerator class GraniteMoeIntegrationTest(unittest.TestCase): # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) # Depending on the hardware we get different logits / generations @@ -325,13 +326,26 @@ class GraniteMoeIntegrationTest(unittest.TestCase): # fmt: off # Expected mean on dim = -1 - EXPECTED_MEAN = torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]) + EXPECTED_MEANS = Expectations( + { + ("xpu", 3): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]), + ("cuda", 7): torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]), + ("cuda", 8): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]), + } + ) + EXPECTED_MEAN = EXPECTED_MEANS.get_expectation() torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2) # slicing logits[0, 0, 0:15] - EXPECTED_SLICE = torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, - -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]]) + EXPECTED_SLICES = Expectations( + { + ("xpu", 3): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]), + ("cuda", 7): torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]]), + ("cuda", 8): torch.tensor([[2.5479, -9.2124, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2162, -9.2122, -6.3101, -3.6224, -3.6377, -5.2542, -5.2524]]), + } + ) + EXPECTED_SLICE = EXPECTED_SLICES.get_expectation() # fmt: on self.assertTrue( @@ -346,10 +360,26 @@ class GraniteMoeIntegrationTest(unittest.TestCase): @slow def test_model_3b_generation(self): # ground truth text generated with dola_layers="low", repetition_penalty=1.2 - EXPECTED_TEXT_COMPLETION = ( - "Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = " - "\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time" + EXPECTED_TEXT_COMPLETIONS = Expectations( + { + ("xpu", 3): ( + "Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n" + "The first part is easy to understand. The second part is a little more difficult.\n\n" + "The second part of the theory of relativity is a little more difficult to understand.\n" + ), + ("cuda", 7): ( + "Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = " + "\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time" + ), + ("cuda", 8): ( + "Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n" + "The first part is easy to understand. The second part is a little more difficult.\n\n" + "The second part of the theory of relativity is a little more difficult to understand.\n" + ), + } ) + EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation() + prompt = "Simply put, the theory of relativity states that " tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b") model = GraniteMoeForCausalLM.from_pretrained("ibm/PowerMoE-3b", device_map="auto") diff --git a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py index 0ac6327448..5de3552c20 100644 --- a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py +++ b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py @@ -19,9 +19,10 @@ from parameterized import parameterized from transformers import AutoTokenizer, GraniteMoeSharedConfig, is_torch_available, set_seed from transformers.testing_utils import ( + Expectations, require_read_token, require_torch, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -304,7 +305,7 @@ class GraniteMoeSharedModelTest(ModelTesterMixin, GenerationTesterMixin, unittes torch.testing.assert_close(yarn_sin_long, original_sin_long) -@require_torch_gpu +@require_torch_accelerator class GraniteMoeSharedIntegrationTest(unittest.TestCase): # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) # Depending on the hardware we get different logits / generations @@ -328,13 +329,26 @@ class GraniteMoeSharedIntegrationTest(unittest.TestCase): # fmt: off # Expected mean on dim = -1 - EXPECTED_MEAN = torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]) + EXPECTED_MEANS = Expectations( + { + ("xpu", 3): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]), + ("cuda", 7): torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]), + ("cuda", 8): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]), + } + ) + EXPECTED_MEAN = EXPECTED_MEANS.get_expectation() torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2) # slicing logits[0, 0, 0:15] - EXPECTED_SLICE = torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, - -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]]) + EXPECTED_SLICES = Expectations( + { + ("xpu", 3): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]), + ("cuda", 7): torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]]), + ("cuda", 8): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]), + } + ) + EXPECTED_SLICE = EXPECTED_SLICES.get_expectation() # fmt: on self.assertTrue( @@ -349,10 +363,26 @@ class GraniteMoeSharedIntegrationTest(unittest.TestCase): @slow def test_model_3b_generation(self): # ground truth text generated with dola_layers="low", repetition_penalty=1.2 - EXPECTED_TEXT_COMPLETION = ( - "Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = " - "\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time" + EXPECTED_TEXT_COMPLETIONS = Expectations( + { + ("xpu", 3): ( + "Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n" + "The first part is easy to understand. The second part is a little more difficult.\n\n" + "The second part of the theory of relativity is a little more difficult to understand.\n" + ), + ("cuda", 7): ( + "Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = " + "\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time" + ), + ("cuda", 8): ( + "Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n" + "The first part is easy to understand. The second part is a little more difficult.\n\n" + "The second part of the theory of relativity is a little more difficult to understand.\n" + ), + } ) + EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation() + prompt = "Simply put, the theory of relativity states that " tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b") model = GraniteMoeSharedForCausalLM.from_pretrained("ibm/PowerMoE-3b", device_map="auto")