enable 6 granite cases on xpu (#37569)
* enable 6 granite cases on XPU Signed-off-by: YAO Matrix <matrix.yao@intel.com> * make them all pass on A100 Signed-off-by: N <matrix.yao@intel.com> * fix style Signed-off-by: YAO Matrix <matrix.yao@intel.com> * update --------- Signed-off-by: YAO Matrix <matrix.yao@intel.com> Signed-off-by: N <matrix.yao@intel.com> Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -19,9 +19,10 @@ from parameterized import parameterized
|
||||
|
||||
from transformers import GraniteConfig, is_torch_available, set_seed
|
||||
from transformers.testing_utils import (
|
||||
Expectations,
|
||||
require_read_token,
|
||||
require_torch,
|
||||
require_torch_gpu,
|
||||
require_torch_accelerator,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
@@ -302,7 +303,7 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
|
||||
torch.testing.assert_close(yarn_sin_long, original_sin_long)
|
||||
|
||||
|
||||
@require_torch_gpu
|
||||
@require_torch_accelerator
|
||||
class GraniteIntegrationTest(unittest.TestCase):
|
||||
# This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
|
||||
# Depending on the hardware we get different logits / generations
|
||||
@@ -328,15 +329,27 @@ class GraniteIntegrationTest(unittest.TestCase):
|
||||
# Expected mean on dim = -1
|
||||
|
||||
# fmt: off
|
||||
EXPECTED_MEAN = torch.tensor([[-1.9798, -3.1626, -2.8062, -2.3777, -2.7091, -2.2338, -2.5924, -2.3974]])
|
||||
EXPECTED_MEANS = Expectations(
|
||||
{
|
||||
("xpu", 3): torch.tensor([[-3.1406, -2.5469, -2.6250, -2.1250, -2.6250, -2.6562, -2.6875, -2.9688]]),
|
||||
("cuda", 7): torch.tensor([[-1.9798, -3.1626, -2.8062, -2.3777, -2.7091, -2.2338, -2.5924, -2.3974]]),
|
||||
("cuda", 8): torch.tensor([[-3.1406, -2.5469, -2.6250, -2.1250, -2.6250, -2.6562, -2.6875, -2.9688]]),
|
||||
}
|
||||
)
|
||||
EXPECTED_MEAN = EXPECTED_MEANS.get_expectation()
|
||||
|
||||
torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.mean(-1), rtol=1e-2, atol=1e-2)
|
||||
torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.mean(-1).float(), rtol=1e-2, atol=1e-2)
|
||||
|
||||
# slicing logits[0, 0, 0:15]
|
||||
EXPECTED_SLICE = torch.tensor([[4.8750, -2.1875, -2.1875, -2.1875, -2.1875, -2.8438, -2.1875, -2.1875,
|
||||
-2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875]])
|
||||
EXPECTED_SLICES = Expectations(
|
||||
{
|
||||
("xpu", 3): torch.tensor([[2.2031, -5.0625, -5.0625, -5.0625, -5.0625, -0.9180, -5.0625, -5.0625, -5.0625, -5.0625, -5.5312, -2.1719, -1.7891, -0.4922, -2.5469]]),
|
||||
("cuda", 7): torch.tensor([[4.8750, -2.1875, -2.1875, -2.1875, -2.1875, -2.8438, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875]]),
|
||||
("cuda", 8): torch.tensor([[2.0938, -5.0312, -5.0312, -5.0312, -5.0312, -1.0469, -5.0312, -5.0312, -5.0312, -5.0312, -5.5625, -2.1875, -1.7891, -0.5820, -2.6250]]),
|
||||
}
|
||||
)
|
||||
EXPECTED_SLICE = EXPECTED_SLICES.get_expectation()
|
||||
# fmt: on
|
||||
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
EXPECTED_SLICE.to(torch_device),
|
||||
@@ -358,6 +371,13 @@ class GraniteIntegrationTest(unittest.TestCase):
|
||||
|
||||
# fmt: off
|
||||
# Expected mean on dim = -1
|
||||
EXPECTED_MEAN = torch.tensor([[-2.0984, -3.1294, -2.8153, -2.3568, -2.7337, -2.2624, -2.6016, -2.4022]])
|
||||
EXPECTED_MEANS = Expectations(
|
||||
{
|
||||
("xpu", 3): torch.tensor([[-3.2693, -2.5957, -2.6234, -2.1675, -2.6386, -2.6850, -2.7039, -2.9656]]),
|
||||
("cuda", 7): torch.tensor([[-2.0984, -3.1294, -2.8153, -2.3568, -2.7337, -2.2624, -2.6016, -2.4022]]),
|
||||
("cuda", 8): torch.tensor([[-3.2934, -2.6019, -2.6258, -2.1691, -2.6394, -2.6876, -2.7032, -2.9688]]),
|
||||
}
|
||||
)
|
||||
EXPECTED_MEAN = EXPECTED_MEANS.get_expectation()
|
||||
|
||||
torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2)
|
||||
|
||||
@@ -19,9 +19,10 @@ from parameterized import parameterized
|
||||
|
||||
from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed
|
||||
from transformers.testing_utils import (
|
||||
Expectations,
|
||||
require_read_token,
|
||||
require_torch,
|
||||
require_torch_gpu,
|
||||
require_torch_accelerator,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
@@ -301,7 +302,7 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
|
||||
torch.testing.assert_close(yarn_sin_long, original_sin_long)
|
||||
|
||||
|
||||
@require_torch_gpu
|
||||
@require_torch_accelerator
|
||||
class GraniteMoeIntegrationTest(unittest.TestCase):
|
||||
# This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
|
||||
# Depending on the hardware we get different logits / generations
|
||||
@@ -325,13 +326,26 @@ class GraniteMoeIntegrationTest(unittest.TestCase):
|
||||
|
||||
# fmt: off
|
||||
# Expected mean on dim = -1
|
||||
EXPECTED_MEAN = torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]])
|
||||
EXPECTED_MEANS = Expectations(
|
||||
{
|
||||
("xpu", 3): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]),
|
||||
("cuda", 7): torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]),
|
||||
("cuda", 8): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]),
|
||||
}
|
||||
)
|
||||
EXPECTED_MEAN = EXPECTED_MEANS.get_expectation()
|
||||
|
||||
torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2)
|
||||
|
||||
# slicing logits[0, 0, 0:15]
|
||||
EXPECTED_SLICE = torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892,
|
||||
-2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]])
|
||||
EXPECTED_SLICES = Expectations(
|
||||
{
|
||||
("xpu", 3): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]),
|
||||
("cuda", 7): torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]]),
|
||||
("cuda", 8): torch.tensor([[2.5479, -9.2124, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2162, -9.2122, -6.3101, -3.6224, -3.6377, -5.2542, -5.2524]]),
|
||||
}
|
||||
)
|
||||
EXPECTED_SLICE = EXPECTED_SLICES.get_expectation()
|
||||
# fmt: on
|
||||
|
||||
self.assertTrue(
|
||||
@@ -346,10 +360,26 @@ class GraniteMoeIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_model_3b_generation(self):
|
||||
# ground truth text generated with dola_layers="low", repetition_penalty=1.2
|
||||
EXPECTED_TEXT_COMPLETION = (
|
||||
"Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = "
|
||||
"\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time"
|
||||
EXPECTED_TEXT_COMPLETIONS = Expectations(
|
||||
{
|
||||
("xpu", 3): (
|
||||
"Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n"
|
||||
"The first part is easy to understand. The second part is a little more difficult.\n\n"
|
||||
"The second part of the theory of relativity is a little more difficult to understand.\n"
|
||||
),
|
||||
("cuda", 7): (
|
||||
"Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = "
|
||||
"\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time"
|
||||
),
|
||||
("cuda", 8): (
|
||||
"Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n"
|
||||
"The first part is easy to understand. The second part is a little more difficult.\n\n"
|
||||
"The second part of the theory of relativity is a little more difficult to understand.\n"
|
||||
),
|
||||
}
|
||||
)
|
||||
EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
|
||||
|
||||
prompt = "Simply put, the theory of relativity states that "
|
||||
tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")
|
||||
model = GraniteMoeForCausalLM.from_pretrained("ibm/PowerMoE-3b", device_map="auto")
|
||||
|
||||
@@ -19,9 +19,10 @@ from parameterized import parameterized
|
||||
|
||||
from transformers import AutoTokenizer, GraniteMoeSharedConfig, is_torch_available, set_seed
|
||||
from transformers.testing_utils import (
|
||||
Expectations,
|
||||
require_read_token,
|
||||
require_torch,
|
||||
require_torch_gpu,
|
||||
require_torch_accelerator,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
@@ -304,7 +305,7 @@ class GraniteMoeSharedModelTest(ModelTesterMixin, GenerationTesterMixin, unittes
|
||||
torch.testing.assert_close(yarn_sin_long, original_sin_long)
|
||||
|
||||
|
||||
@require_torch_gpu
|
||||
@require_torch_accelerator
|
||||
class GraniteMoeSharedIntegrationTest(unittest.TestCase):
|
||||
# This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
|
||||
# Depending on the hardware we get different logits / generations
|
||||
@@ -328,13 +329,26 @@ class GraniteMoeSharedIntegrationTest(unittest.TestCase):
|
||||
|
||||
# fmt: off
|
||||
# Expected mean on dim = -1
|
||||
EXPECTED_MEAN = torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]])
|
||||
EXPECTED_MEANS = Expectations(
|
||||
{
|
||||
("xpu", 3): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]),
|
||||
("cuda", 7): torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]),
|
||||
("cuda", 8): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]),
|
||||
}
|
||||
)
|
||||
|
||||
EXPECTED_MEAN = EXPECTED_MEANS.get_expectation()
|
||||
torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2)
|
||||
|
||||
# slicing logits[0, 0, 0:15]
|
||||
EXPECTED_SLICE = torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892,
|
||||
-2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]])
|
||||
EXPECTED_SLICES = Expectations(
|
||||
{
|
||||
("xpu", 3): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]),
|
||||
("cuda", 7): torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, -2.2892]]),
|
||||
("cuda", 8): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]),
|
||||
}
|
||||
)
|
||||
EXPECTED_SLICE = EXPECTED_SLICES.get_expectation()
|
||||
# fmt: on
|
||||
|
||||
self.assertTrue(
|
||||
@@ -349,10 +363,26 @@ class GraniteMoeSharedIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_model_3b_generation(self):
|
||||
# ground truth text generated with dola_layers="low", repetition_penalty=1.2
|
||||
EXPECTED_TEXT_COMPLETION = (
|
||||
"Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = "
|
||||
"\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time"
|
||||
EXPECTED_TEXT_COMPLETIONS = Expectations(
|
||||
{
|
||||
("xpu", 3): (
|
||||
"Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n"
|
||||
"The first part is easy to understand. The second part is a little more difficult.\n\n"
|
||||
"The second part of the theory of relativity is a little more difficult to understand.\n"
|
||||
),
|
||||
("cuda", 7): (
|
||||
"Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = "
|
||||
"\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time"
|
||||
),
|
||||
("cuda", 8): (
|
||||
"Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n"
|
||||
"The first part is easy to understand. The second part is a little more difficult.\n\n"
|
||||
"The second part of the theory of relativity is a little more difficult to understand.\n"
|
||||
),
|
||||
}
|
||||
)
|
||||
EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
|
||||
|
||||
prompt = "Simply put, the theory of relativity states that "
|
||||
tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")
|
||||
model = GraniteMoeSharedForCausalLM.from_pretrained("ibm/PowerMoE-3b", device_map="auto")
|
||||
|
||||
Reference in New Issue
Block a user