enable 6 granite cases on xpu (#37569)

* enable 6 granite cases on XPU

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* make them all pass on A100

Signed-off-by: N <matrix.yao@intel.com>

* fix style

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* update

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Signed-off-by: N <matrix.yao@intel.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Yao Matrix
2025-04-22 23:55:02 +08:00
committed by GitHub
parent 9167461a7d
commit 6673081b21
3 changed files with 104 additions and 24 deletions

View File

@@ -19,9 +19,10 @@ from parameterized import parameterized
from transformers import GraniteConfig, is_torch_available, set_seed
from transformers.testing_utils import (
Expectations,
require_read_token,
require_torch,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -302,7 +303,7 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
torch.testing.assert_close(yarn_sin_long, original_sin_long)
@require_torch_gpu
@require_torch_accelerator
class GraniteIntegrationTest(unittest.TestCase):
# This variable is used to determine which CUDA device we are using for our runners (A10 or T4).
# Depending on the hardware, we get different logits / generations.
@@ -328,15 +329,27 @@ class GraniteIntegrationTest(unittest.TestCase):
# Expected mean on dim = -1
# fmt: off
EXPECTED_MEAN = torch.tensor([[-1.9798, -3.1626, -2.8062, -2.3777, -2.7091, -2.2338, -2.5924, -2.3974]])
EXPECTED_MEANS = Expectations(
{
("xpu", 3): torch.tensor([[-3.1406, -2.5469, -2.6250, -2.1250, -2.6250, -2.6562, -2.6875, -2.9688]]),
("cuda", 7): torch.tensor([[-1.9798, -3.1626, -2.8062, -2.3777, -2.7091, -2.2338, -2.5924, -2.3974]]),
("cuda", 8): torch.tensor([[-3.1406, -2.5469, -2.6250, -2.1250, -2.6250, -2.6562, -2.6875, -2.9688]]),
}
)
EXPECTED_MEAN = EXPECTED_MEANS.get_expectation()
torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.mean(-1), rtol=1e-2, atol=1e-2)
torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.mean(-1).float(), rtol=1e-2, atol=1e-2)
# slicing logits[0, 0, 0:15]
EXPECTED_SLICE = torch.tensor([[4.8750, -2.1875, -2.1875, -2.1875, -2.1875, -2.8438, -2.1875, -2.1875,
-2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875]])
EXPECTED_SLICES = Expectations(
{
("xpu", 3): torch.tensor([[2.2031, -5.0625, -5.0625, -5.0625, -5.0625, -0.9180, -5.0625, -5.0625, -5.0625, -5.0625, -5.5312, -2.1719, -1.7891, -0.4922, -2.5469]]),
("cuda", 7): torch.tensor([[4.8750, -2.1875, -2.1875, -2.1875, -2.1875, -2.8438, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875]]),
("cuda", 8): torch.tensor([[2.0938, -5.0312, -5.0312, -5.0312, -5.0312, -1.0469, -5.0312, -5.0312, -5.0312, -5.0312, -5.5625, -2.1875, -1.7891, -0.5820, -2.6250]]),
}
)
EXPECTED_SLICE = EXPECTED_SLICES.get_expectation()
# fmt: on
self.assertTrue(
torch.allclose(
EXPECTED_SLICE.to(torch_device),
@@ -358,6 +371,13 @@ class GraniteIntegrationTest(unittest.TestCase):
# fmt: off
# Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[-2.0984, -3.1294, -2.8153, -2.3568, -2.7337, -2.2624, -2.6016, -2.4022]])
EXPECTED_MEANS = Expectations(
{
("xpu", 3): torch.tensor([[-3.2693, -2.5957, -2.6234, -2.1675, -2.6386, -2.6850, -2.7039, -2.9656]]),
("cuda", 7): torch.tensor([[-2.0984, -3.1294, -2.8153, -2.3568, -2.7337, -2.2624, -2.6016, -2.4022]]),
("cuda", 8): torch.tensor([[-3.2934, -2.6019, -2.6258, -2.1691, -2.6394, -2.6876, -2.7032, -2.9688]]),
}
)
EXPECTED_MEAN = EXPECTED_MEANS.get_expectation()
torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2)

View File

@@ -19,9 +19,10 @@ from parameterized import parameterized
from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed
from transformers.testing_utils import (
Expectations,
require_read_token,
require_torch,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -301,7 +302,7 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
torch.testing.assert_close(yarn_sin_long, original_sin_long)
@require_torch_gpu
@require_torch_accelerator
class GraniteMoeIntegrationTest(unittest.TestCase):
# This variable is used to determine which CUDA device we are using for our runners (A10 or T4).
# Depending on the hardware, we get different logits / generations.
@@ -325,13 +326,26 @@ class GraniteMoeIntegrationTest(unittest.TestCase):
# fmt: off
# Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]])
EXPECTED_MEANS = Expectations(
{
("xpu", 3): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]),
("cuda", 7): torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]),
("cuda", 8): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]),
}
)
EXPECTED_MEAN = EXPECTED_MEANS.get_expectation()
torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2)
# slicing logits[0, 0, 0:15]
EXPECTED_SLICE = torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892,
-2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]])
EXPECTED_SLICES = Expectations(
{
("xpu", 3): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]),
("cuda", 7): torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]]),
("cuda", 8): torch.tensor([[2.5479, -9.2124, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2162, -9.2122, -6.3101, -3.6224, -3.6377, -5.2542, -5.2524]]),
}
)
EXPECTED_SLICE = EXPECTED_SLICES.get_expectation()
# fmt: on
self.assertTrue(
@@ -346,10 +360,26 @@ class GraniteMoeIntegrationTest(unittest.TestCase):
@slow
def test_model_3b_generation(self):
# ground truth text generated with dola_layers="low", repetition_penalty=1.2
EXPECTED_TEXT_COMPLETION = (
EXPECTED_TEXT_COMPLETIONS = Expectations(
{
("xpu", 3): (
"Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n"
"The first part is easy to understand. The second part is a little more difficult.\n\n"
"The second part of the theory of relativity is a little more difficult to understand.\n"
),
("cuda", 7): (
"Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = "
"\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time"
),
("cuda", 8): (
"Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n"
"The first part is easy to understand. The second part is a little more difficult.\n\n"
"The second part of the theory of relativity is a little more difficult to understand.\n"
),
}
)
EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
prompt = "Simply put, the theory of relativity states that "
tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")
model = GraniteMoeForCausalLM.from_pretrained("ibm/PowerMoE-3b", device_map="auto")

View File

@@ -19,9 +19,10 @@ from parameterized import parameterized
from transformers import AutoTokenizer, GraniteMoeSharedConfig, is_torch_available, set_seed
from transformers.testing_utils import (
Expectations,
require_read_token,
require_torch,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -304,7 +305,7 @@ class GraniteMoeSharedModelTest(ModelTesterMixin, GenerationTesterMixin, unittes
torch.testing.assert_close(yarn_sin_long, original_sin_long)
@require_torch_gpu
@require_torch_accelerator
class GraniteMoeSharedIntegrationTest(unittest.TestCase):
# This variable is used to determine which CUDA device we are using for our runners (A10 or T4).
# Depending on the hardware, we get different logits / generations.
@@ -328,13 +329,26 @@ class GraniteMoeSharedIntegrationTest(unittest.TestCase):
# fmt: off
# Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]])
EXPECTED_MEANS = Expectations(
{
("xpu", 3): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]),
("cuda", 7): torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]),
("cuda", 8): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]),
}
)
EXPECTED_MEAN = EXPECTED_MEANS.get_expectation()
torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2)
# slicing logits[0, 0, 0:15]
EXPECTED_SLICE = torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892,
-2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]])
EXPECTED_SLICES = Expectations(
{
("xpu", 3): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]),
("cuda", 7): torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]]),
("cuda", 8): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]),
}
)
EXPECTED_SLICE = EXPECTED_SLICES.get_expectation()
# fmt: on
self.assertTrue(
@@ -349,10 +363,26 @@ class GraniteMoeSharedIntegrationTest(unittest.TestCase):
@slow
def test_model_3b_generation(self):
# ground truth text generated with dola_layers="low", repetition_penalty=1.2
EXPECTED_TEXT_COMPLETION = (
EXPECTED_TEXT_COMPLETIONS = Expectations(
{
("xpu", 3): (
"Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n"
"The first part is easy to understand. The second part is a little more difficult.\n\n"
"The second part of the theory of relativity is a little more difficult to understand.\n"
),
("cuda", 7): (
"Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = "
"\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time"
),
("cuda", 8): (
"Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n"
"The first part is easy to understand. The second part is a little more difficult.\n\n"
"The second part of the theory of relativity is a little more difficult to understand.\n"
),
}
)
EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
prompt = "Simply put, the theory of relativity states that "
tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")
model = GraniteMoeSharedForCausalLM.from_pretrained("ibm/PowerMoE-3b", device_map="auto")