From fb82a98717309c5b9b9683bf7f55dece70141cc1 Mon Sep 17 00:00:00 2001 From: Yao Matrix Date: Wed, 28 May 2025 16:30:16 +0800 Subject: [PATCH] enable large_gpu and torchao cases on XPU (#38355) * cohere2 done Signed-off-by: Matrix Yao * enable torchao cases on XPU Signed-off-by: Matrix YAO * fix Signed-off-by: Matrix YAO * fix Signed-off-by: Matrix YAO * fix Signed-off-by: Matrix YAO * rename Signed-off-by: Matrix YAO * fix Signed-off-by: Matrix YAO * fix comments Signed-off-by: Matrix YAO --------- Signed-off-by: Matrix Yao Signed-off-by: Matrix YAO --- tests/models/cohere2/test_modeling_cohere2.py | 35 +++- .../quantization/autoround/test_auto_round.py | 4 +- .../torchao_integration/test_torchao.py | 174 +++++++++++++----- 3 files changed, 151 insertions(+), 62 deletions(-) diff --git a/tests/models/cohere2/test_modeling_cohere2.py b/tests/models/cohere2/test_modeling_cohere2.py index 9282c22d41..9aa1ab7c68 100644 --- a/tests/models/cohere2/test_modeling_cohere2.py +++ b/tests/models/cohere2/test_modeling_cohere2.py @@ -23,10 +23,11 @@ from pytest import mark from transformers import AutoModelForCausalLM, AutoTokenizer, Cohere2Config, is_torch_available, pipeline from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( + Expectations, require_flash_attn, require_read_token, require_torch, - require_torch_large_gpu, + require_torch_large_accelerator, slow, torch_device, ) @@ -130,7 +131,7 @@ class Cohere2ModelTest(CohereModelTest, unittest.TestCase): @slow @require_read_token -@require_torch_large_gpu +@require_torch_large_accelerator class Cohere2IntegrationTest(unittest.TestCase): input_text = ["Hello I am doing", "Hi today"] @@ -155,10 +156,15 @@ class Cohere2IntegrationTest(unittest.TestCase): def test_model_fp16(self): model_id = "CohereForAI/c4ai-command-r7b-12-2024" - EXPECTED_TEXTS = [ - "Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have", - "Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n", - ] + # fmt: off + EXPECTED_TEXTS = Expectations( + { + ("xpu", 3): ["Hello I am doing a project for my school and I need to create a website for a fictional company. I have the", "Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n"], + ("cuda", 7): ["Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have", "Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n",], + } + ) + EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() + # fmt: on model = AutoModelForCausalLM.from_pretrained( model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager" @@ -170,7 +176,7 @@ class Cohere2IntegrationTest(unittest.TestCase): output = model.generate(**inputs, max_new_tokens=20, do_sample=False) output_text = tokenizer.batch_decode(output, skip_special_tokens=False) - self.assertEqual(output_text, EXPECTED_TEXTS) + self.assertEqual(output_text, EXPECTED_TEXT) def test_model_pipeline_bf16(self): # See https://github.com/huggingface/transformers/pull/31747 -- pipeline was broken for Cohere2 before this PR @@ -223,9 +229,15 @@ class Cohere2IntegrationTest(unittest.TestCase): ) model_id = "CohereForAI/c4ai-command-r7b-12-2024" - EXPECTED_TEXT_COMPLETION = [ - "Hello I am doing a project on the effects of social media on mental health. I have a few questions. 1. What is the relationship", - ] + # fmt: off + EXPECTED_TEXT_COMPLETIONS = Expectations( + { + ("xpu", 3): ["Hello I am doing a project for a friend and I am stuck on a few things. I have a 2004 Ford F-"], + ("cuda", 7): ["Hello I am doing a project on the effects of social media on mental health. I have a few questions. 1. What is the relationship",], + } + ) + EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation() + # fmt: on tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token="", padding_side="right") # Load model @@ -270,6 +282,9 @@ class Cohere2IntegrationTest(unittest.TestCase): we need to correctly slice the attention mask in all cases (because we use a HybridCache). Outputs for every attention functions should be coherent and identical. """ + if torch_device == "xpu" and attn_implementation == "flash_attention_2": + self.skipTest(reason="Intel XPU doesn't support falsh_attention_2 as of now.") + model_id = "CohereForAI/c4ai-command-r7b-12-2024" EXPECTED_COMPLETIONS = [ " the mountains, the lakes, the rivers, the waterfalls, the waterfalls, the waterfalls, the waterfalls", diff --git a/tests/quantization/autoround/test_auto_round.py b/tests/quantization/autoround/test_auto_round.py index abdf6be5d6..5eb2265d9c 100644 --- a/tests/quantization/autoround/test_auto_round.py +++ b/tests/quantization/autoround/test_auto_round.py @@ -143,9 +143,9 @@ class AutoRoundTest(unittest.TestCase): self.assertIn(output_tokens, self.EXPECTED_OUTPUTS) @require_torch_multi_accelerator - def test_quantized_model_multi_gpu(self): + def test_quantized_model_multi_accelerator(self): """ - Simple test that checks if the quantized model is working properly with multiple GPUs + Simple test that checks if the quantized model is working properly with multiple accelerators """ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) quantization_config = AutoRoundConfig(backend="triton") diff --git a/tests/quantization/torchao_integration/test_torchao.py b/tests/quantization/torchao_integration/test_torchao.py index 37fc538fdb..bf60deef8b 100644 --- a/tests/quantization/torchao_integration/test_torchao.py +++ b/tests/quantization/torchao_integration/test_torchao.py @@ -21,10 +21,11 @@ from packaging import version from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig from transformers.testing_utils import ( + Expectations, backend_empty_cache, get_device_properties, - require_torch_gpu, - require_torch_multi_gpu, + require_torch_accelerator, + require_torch_multi_accelerator, require_torchao, require_torchao_version_greater_or_equal, torch_device, @@ -52,6 +53,8 @@ if is_torchao_available(): if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0"): from torchao.dtypes import Int4CPULayout + if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.11.0"): + from torchao.dtypes import Int4XPULayout def check_torchao_int4_wo_quantized(test_module, qlayer): @@ -59,7 +62,13 @@ def check_torchao_int4_wo_quantized(test_module, qlayer): test_module.assertEqual(weight.quant_min, 0) test_module.assertEqual(weight.quant_max, 15) test_module.assertTrue(isinstance(weight, AffineQuantizedTensor)) - layout = Int4CPULayout if weight.device.type == "cpu" else TensorCoreTiledLayout + layout = None + if weight.device.type == "cpu": + layout = Int4CPULayout + elif weight.device.type == "xpu": + layout = Int4XPULayout + elif weight.device.type == "cuda": + layout = TensorCoreTiledLayout test_module.assertTrue(isinstance(weight.tensor_impl._layout, layout)) @@ -123,7 +132,6 @@ class TorchAoConfigTest(unittest.TestCase): class TorchAoTest(unittest.TestCase): input_text = "What are we having for dinner?" max_new_tokens = 10 - EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside" model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" device = "cpu" quant_scheme_kwargs = ( @@ -132,6 +140,11 @@ class TorchAoTest(unittest.TestCase): else {"group_size": 32} ) + # called only once for all test in this class + @classmethod + def setUpClass(cls): + cls.EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside" + def tearDown(self): gc.collect() backend_empty_cache(torch_device) @@ -261,11 +274,25 @@ class TorchAoTest(unittest.TestCase): self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT) -@require_torch_gpu -class TorchAoGPUTest(TorchAoTest): +@require_torch_accelerator +class TorchAoAcceleratorTest(TorchAoTest): device = torch_device quant_scheme_kwargs = {"group_size": 32} + # called only once for all test in this class + @classmethod + def setUpClass(cls): + super().setUpClass() + # fmt: off + EXPECTED_OUTPUTS = Expectations( + { + ("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)", + ("cuda", 7): "What are we having for dinner?\n- 1. What is the temperature outside", + } + ) + # fmt: on + cls.EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation() + def test_int4wo_offload(self): """ Simple test that checks if the quantized model int4 weight only is working properly with cpu/disk offload @@ -312,16 +339,27 @@ class TorchAoGPUTest(TorchAoTest): input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device) + # fmt: off + EXPECTED_OUTPUTS = Expectations( + { + ("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)", + ("cuda", 7): "What are we having for dinner?\n- 2. What is the temperature outside", + } + ) + # fmt: on + EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation() + output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens) - EXPECTED_OUTPUT = "What are we having for dinner?\n- 2. What is the temperature outside" + generated_text = tokenizer.decode(output[0], skip_special_tokens=True) - self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT) + self.assertEqual(generated_text, EXPECTED_OUTPUT) - @require_torch_multi_gpu - def test_int4wo_quant_multi_gpu(self): + @require_torch_multi_accelerator + def test_int4wo_quant_multi_accelerator(self): """ - Simple test that checks if the quantized model int4 weight only is working properly with multiple GPUs - set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs + Simple test that checks if the quantized model int4 weight only is working properly with multiple accelerators + set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs + set ZE_AFFINITY_MASK=0,1 if you have more than 2 Intel XPUs """ quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs) @@ -373,7 +411,6 @@ class TorchAoGPUTest(TorchAoTest): class TorchAoSerializationTest(unittest.TestCase): input_text = "What are we having for dinner?" max_new_tokens = 10 - EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside" model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quant_scheme = "int4_weight_only" quant_scheme_kwargs = ( @@ -387,6 +424,7 @@ class TorchAoSerializationTest(unittest.TestCase): @classmethod def setUpClass(cls): cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name) + cls.EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside" def setUp(self): self.quant_config = TorchAoConfig(self.quant_scheme, **self.quant_scheme_kwargs) @@ -430,58 +468,91 @@ class TorchAoSerializationTest(unittest.TestCase): class TorchAoSerializationW8A8CPUTest(TorchAoSerializationTest): quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {} - EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" - @require_torch_gpu - def test_serialization_expected_output_on_cuda(self): + # called only once for all test in this class + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" + + @require_torch_accelerator + def test_serialization_expected_output_on_accelerator(self): """ - Test if we can serialize on device (cpu) and load/infer the model on cuda + Test if we can serialize on device (cpu) and load/infer the model on accelerator """ - self.check_serialization_expected_output("cuda", self.EXPECTED_OUTPUT) + self.check_serialization_expected_output(torch_device, self.EXPECTED_OUTPUT) class TorchAoSerializationW8CPUTest(TorchAoSerializationTest): quant_scheme, quant_scheme_kwargs = "int8_weight_only", {} - EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" - @require_torch_gpu - def test_serialization_expected_output_on_cuda(self): + # called only once for all test in this class + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" + + @require_torch_accelerator + def test_serialization_expected_output_on_accelerator(self): """ - Test if we can serialize on device (cpu) and load/infer the model on cuda + Test if we can serialize on device (cpu) and load/infer the model on accelerator """ - self.check_serialization_expected_output("cuda", self.EXPECTED_OUTPUT) + self.check_serialization_expected_output(torch_device, self.EXPECTED_OUTPUT) -@require_torch_gpu -class TorchAoSerializationGPTTest(TorchAoSerializationTest): +@require_torch_accelerator +class TorchAoSerializationAcceleratorTest(TorchAoSerializationTest): quant_scheme, quant_scheme_kwargs = "int4_weight_only", {"group_size": 32} device = f"{torch_device}:0" - -@require_torch_gpu -class TorchAoSerializationW8A8GPUTest(TorchAoSerializationTest): - quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {} - EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" - device = f"{torch_device}:0" - - -@require_torch_gpu -class TorchAoSerializationW8GPUTest(TorchAoSerializationTest): - quant_scheme, quant_scheme_kwargs = "int8_weight_only", {} - EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" - device = f"{torch_device}:0" - - -@require_torch_gpu -@require_torchao_version_greater_or_equal("0.10.0") -class TorchAoSerializationFP8GPUTest(TorchAoSerializationTest): - EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" - device = f"{torch_device}:0" - # called only once for all test in this class @classmethod def setUpClass(cls): - if not (get_device_properties()[0] == "cuda" and get_device_properties()[1] >= 9): + super().setUpClass() + # fmt: off + EXPECTED_OUTPUTS = Expectations( + { + ("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)", + ("cuda", 7): "What are we having for dinner?\n- 1. What is the temperature outside", + } + ) + # fmt: on + cls.EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation() + + +@require_torch_accelerator +class TorchAoSerializationW8A8AcceleratorTest(TorchAoSerializationTest): + quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {} + device = f"{torch_device}:0" + + # called only once for all test in this class + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" + + +@require_torch_accelerator +class TorchAoSerializationW8AcceleratorTest(TorchAoSerializationTest): + quant_scheme, quant_scheme_kwargs = "int8_weight_only", {} + device = f"{torch_device}:0" + + # called only once for all test in this class + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" + + +@require_torch_accelerator +@require_torchao_version_greater_or_equal("0.10.0") +class TorchAoSerializationFP8AcceleratorTest(TorchAoSerializationTest): + device = f"{torch_device}:0" + + # called only once for all test in this class + @classmethod + def setUpClass(cls): + if get_device_properties()[0] == "cuda" and get_device_properties()[1] < 9: raise unittest.SkipTest("CUDA compute capability 9.0 or higher required for FP8 tests") from torchao.quantization import Float8WeightOnlyConfig @@ -491,17 +562,18 @@ class TorchAoSerializationFP8GPUTest(TorchAoSerializationTest): super().setUpClass() + cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" -@require_torch_gpu + +@require_torch_accelerator @require_torchao_version_greater_or_equal("0.10.0") class TorchAoSerializationA8W4Test(TorchAoSerializationTest): - EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" device = f"{torch_device}:0" # called only once for all test in this class @classmethod def setUpClass(cls): - if not (get_device_properties()[0] == "cuda" and get_device_properties()[1] >= 9): + if get_device_properties()[0] == "cuda" and get_device_properties()[1] < 9: raise unittest.SkipTest("CUDA compute capability 9.0 or higher required for FP8 tests") from torchao.quantization import Int8DynamicActivationInt4WeightConfig @@ -511,6 +583,8 @@ class TorchAoSerializationA8W4Test(TorchAoSerializationTest): super().setUpClass() + cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" + if __name__ == "__main__": unittest.main()