From a1b82563f11d9101d54b06fd61aef8c90f63c9d2 Mon Sep 17 00:00:00 2001 From: Yao Matrix Date: Fri, 18 Apr 2025 18:28:08 +0800 Subject: [PATCH] enable 6 modeling cases on XPU (#37571) Signed-off-by: YAO Matrix --- tests/models/bamba/test_modeling_bamba.py | 49 +++++++++++++------ tests/models/gemma/test_modeling_gemma.py | 2 +- tests/models/mpt/test_modeling_mpt.py | 8 ++- .../models/nemotron/test_modeling_nemotron.py | 17 +++++-- 4 files changed, 55 insertions(+), 21 deletions(-) diff --git a/tests/models/bamba/test_modeling_bamba.py b/tests/models/bamba/test_modeling_bamba.py index c0703f8490..2159f427bd 100644 --- a/tests/models/bamba/test_modeling_bamba.py +++ b/tests/models/bamba/test_modeling_bamba.py @@ -19,7 +19,14 @@ import unittest import pytest from transformers import AutoTokenizer, BambaConfig, is_torch_available -from transformers.testing_utils import Expectations, require_torch, require_torch_gpu, slow, torch_device +from transformers.testing_utils import ( + Expectations, + require_deterministic_for_xpu, + require_torch, + require_torch_accelerator, + slow, + torch_device, +) from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -474,7 +481,7 @@ class BambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi @slow @require_torch -@require_torch_gpu +@require_torch_accelerator class BambaModelIntegrationTest(unittest.TestCase): model = None tokenizer = None @@ -507,6 +514,10 @@ class BambaModelIntegrationTest(unittest.TestCase): "rocm", 9, ): "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here", + ( + "xpu", + 3, + ): "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all doing well. Today I", } ) @@ -536,22 +547,30 @@ class BambaModelIntegrationTest(unittest.TestCase): torch.testing.assert_close(logits[0, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD, rtol=1e-3, atol=1) + @require_deterministic_for_xpu def test_simple_batched_generate_with_padding(self): # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. # # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, # considering differences in hardware processing and potential deviations in generated text. - EXPECTED_TEXTS = { - 7: [], - 8: [ - "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here", - "!!!<|begin_of_text|>I am late! I need to get to work! I have to get to the", - ], - 9: [ - "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here", - "!!!<|begin_of_text|>I am late! I need to be at the airport in 20 minutes! I", - ], - } + EXPECTED_TEXTS = Expectations( + { + ("cuda", 7): [], + ("cuda", 8): [ + "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here", + "!!!<|begin_of_text|>I am late! I need to get to work! I have to get to the", + ], + ("rocm", 9): [ + "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here", + "!!!<|begin_of_text|>I am late! I need to be at the airport in 20 minutes! I", + ], + ("xpu", 3): [ + "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all doing well. Today I", + "!!!<|begin_of_text|>I am late! I need to get to work! I have to get to the", + ], + } + ) + EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.model.to(torch_device) @@ -562,8 +581,8 @@ class BambaModelIntegrationTest(unittest.TestCase): ).to(torch_device) out = self.model.generate(**inputs, do_sample=False, max_new_tokens=10) output_sentences = self.tokenizer.batch_decode(out) - self.assertEqual(output_sentences[0], EXPECTED_TEXTS[self.cuda_compute_capability_major_version][0]) - self.assertEqual(output_sentences[1], EXPECTED_TEXTS[self.cuda_compute_capability_major_version][1]) + self.assertEqual(output_sentences[0], EXPECTED_TEXT[0]) + self.assertEqual(output_sentences[1], EXPECTED_TEXT[1]) # TODO: there are significant differences in the logits across major cuda versions, which shouldn't exist if self.cuda_compute_capability_major_version == 8: diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index 5e45e78a44..4b7293817a 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -643,7 +643,7 @@ class GemmaIntegrationTest(unittest.TestCase): self.assertEqual(output_text, EXPECTED_TEXTS) @slow - @require_torch_gpu + @require_torch_accelerator @require_read_token def test_compile_static_cache(self): # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 diff --git a/tests/models/mpt/test_modeling_mpt.py b/tests/models/mpt/test_modeling_mpt.py index 458e458dd2..844ecfd46e 100644 --- a/tests/models/mpt/test_modeling_mpt.py +++ b/tests/models/mpt/test_modeling_mpt.py @@ -520,7 +520,13 @@ class MptIntegrationTests(unittest.TestCase): outputs = model(dummy_input, output_hidden_states=True) - expected_slice = torch.Tensor([-0.2520, -0.2178, -0.1953]).to(torch_device, torch.bfloat16) + expected_slices = Expectations( + { + ("xpu", 3): torch.Tensor([-0.2090, -0.2061, -0.1465]), + ("cuda", 7): torch.Tensor([-0.2520, -0.2178, -0.1953]), + } + ) + expected_slice = expected_slices.get_expectation().to(torch_device, torch.bfloat16) predicted_slice = outputs.hidden_states[-1][0, 0, :3] torch.testing.assert_close(expected_slice, predicted_slice, rtol=1e-3, atol=1e-3) diff --git a/tests/models/nemotron/test_modeling_nemotron.py b/tests/models/nemotron/test_modeling_nemotron.py index 58c36b615f..d573537b6b 100644 --- a/tests/models/nemotron/test_modeling_nemotron.py +++ b/tests/models/nemotron/test_modeling_nemotron.py @@ -21,6 +21,7 @@ import pytest from transformers import NemotronConfig, is_torch_available from transformers.testing_utils import ( + Expectations, is_flaky, require_flash_attn, require_read_token, @@ -168,7 +169,7 @@ class NemotronModelTest(GemmaModelTest): assert torch.allclose(logits_fa, logits, atol=1e-2) -@require_torch_gpu +@require_torch_accelerator class NemotronIntegrationTest(unittest.TestCase): # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) # Depending on the hardware we get different logits / generations @@ -202,9 +203,17 @@ class NemotronIntegrationTest(unittest.TestCase): @require_read_token def test_nemotron_8b_generation_eager(self): text = ["What is the largest planet in solar system?"] - EXPECTED_TEXT = [ - "What is the largest planet in solar system?\nAnswer: Jupiter\n\nWhat is the answer", - ] + EXPECTED_TEXTS = Expectations( + { + ("xpu", 3): [ + "What is the largest planet in solar system?\nAnswer: Jupiter\n\nWhat is the answer: What is the name of the 19", + ], + ("cuda", 7): [ + "What is the largest planet in solar system?\nAnswer: Jupiter\n\nWhat is the answer", + ], + } + ) + EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() model_id = "thhaus/nemotron3-8b" model = NemotronForCausalLM.from_pretrained( model_id, torch_dtype=torch.float16, device_map="auto", attn_implementation="eager"