enable large_gpu and torchao cases on XPU (#38355)

* cohere2 done

Signed-off-by: Matrix Yao <matrix.yao@intel.com>

* enable torchao cases on XPU

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* rename

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix comments

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

---------

Signed-off-by: Matrix Yao <matrix.yao@intel.com>
Signed-off-by: Matrix YAO <matrix.yao@intel.com>
This commit is contained in:
Yao Matrix
2025-05-28 16:30:16 +08:00
committed by GitHub
parent cea254c909
commit fb82a98717
3 changed files with 151 additions and 62 deletions

View File

@@ -23,10 +23,11 @@ from pytest import mark
from transformers import AutoModelForCausalLM, AutoTokenizer, Cohere2Config, is_torch_available, pipeline from transformers import AutoModelForCausalLM, AutoTokenizer, Cohere2Config, is_torch_available, pipeline
from transformers.generation.configuration_utils import GenerationConfig from transformers.generation.configuration_utils import GenerationConfig
from transformers.testing_utils import ( from transformers.testing_utils import (
Expectations,
require_flash_attn, require_flash_attn,
require_read_token, require_read_token,
require_torch, require_torch,
require_torch_large_gpu, require_torch_large_accelerator,
slow, slow,
torch_device, torch_device,
) )
@@ -130,7 +131,7 @@ class Cohere2ModelTest(CohereModelTest, unittest.TestCase):
@slow @slow
@require_read_token @require_read_token
@require_torch_large_gpu @require_torch_large_accelerator
class Cohere2IntegrationTest(unittest.TestCase): class Cohere2IntegrationTest(unittest.TestCase):
input_text = ["Hello I am doing", "Hi today"] input_text = ["Hello I am doing", "Hi today"]
@@ -155,10 +156,15 @@ class Cohere2IntegrationTest(unittest.TestCase):
def test_model_fp16(self): def test_model_fp16(self):
model_id = "CohereForAI/c4ai-command-r7b-12-2024" model_id = "CohereForAI/c4ai-command-r7b-12-2024"
EXPECTED_TEXTS = [ # fmt: off
"<BOS_TOKEN>Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have", EXPECTED_TEXTS = Expectations(
"<PAD><PAD><BOS_TOKEN>Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n", {
] ("xpu", 3): ["<BOS_TOKEN>Hello I am doing a project for my school and I need to create a website for a fictional company. I have the", "<PAD><PAD><BOS_TOKEN>Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n"],
("cuda", 7): ["<BOS_TOKEN>Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have", "<PAD><PAD><BOS_TOKEN>Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n",],
}
)
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
# fmt: on
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager" model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager"
@@ -170,7 +176,7 @@ class Cohere2IntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=20, do_sample=False) output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=False) output_text = tokenizer.batch_decode(output, skip_special_tokens=False)
self.assertEqual(output_text, EXPECTED_TEXTS) self.assertEqual(output_text, EXPECTED_TEXT)
def test_model_pipeline_bf16(self): def test_model_pipeline_bf16(self):
# See https://github.com/huggingface/transformers/pull/31747 -- pipeline was broken for Cohere2 before this PR # See https://github.com/huggingface/transformers/pull/31747 -- pipeline was broken for Cohere2 before this PR
@@ -223,9 +229,15 @@ class Cohere2IntegrationTest(unittest.TestCase):
) )
model_id = "CohereForAI/c4ai-command-r7b-12-2024" model_id = "CohereForAI/c4ai-command-r7b-12-2024"
EXPECTED_TEXT_COMPLETION = [ # fmt: off
"Hello I am doing a project on the effects of social media on mental health. I have a few questions. 1. What is the relationship", EXPECTED_TEXT_COMPLETIONS = Expectations(
] {
("xpu", 3): ["Hello I am doing a project for a friend and I am stuck on a few things. I have a 2004 Ford F-"],
("cuda", 7): ["Hello I am doing a project on the effects of social media on mental health. I have a few questions. 1. What is the relationship",],
}
)
EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
# fmt: on
tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token="<PAD>", padding_side="right") tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token="<PAD>", padding_side="right")
# Load model # Load model
@@ -270,6 +282,9 @@ class Cohere2IntegrationTest(unittest.TestCase):
we need to correctly slice the attention mask in all cases (because we use a HybridCache). we need to correctly slice the attention mask in all cases (because we use a HybridCache).
Outputs for every attention function should be coherent and identical. Outputs for every attention function should be coherent and identical.
""" """
if torch_device == "xpu" and attn_implementation == "flash_attention_2":
self.skipTest(reason="Intel XPU doesn't support flash_attention_2 as of now.")
model_id = "CohereForAI/c4ai-command-r7b-12-2024" model_id = "CohereForAI/c4ai-command-r7b-12-2024"
EXPECTED_COMPLETIONS = [ EXPECTED_COMPLETIONS = [
" the mountains, the lakes, the rivers, the waterfalls, the waterfalls, the waterfalls, the waterfalls", " the mountains, the lakes, the rivers, the waterfalls, the waterfalls, the waterfalls, the waterfalls",

View File

@@ -143,9 +143,9 @@ class AutoRoundTest(unittest.TestCase):
self.assertIn(output_tokens, self.EXPECTED_OUTPUTS) self.assertIn(output_tokens, self.EXPECTED_OUTPUTS)
@require_torch_multi_accelerator @require_torch_multi_accelerator
def test_quantized_model_multi_gpu(self): def test_quantized_model_multi_accelerator(self):
""" """
Simple test that checks if the quantized model is working properly with multiple GPUs Simple test that checks if the quantized model is working properly with multiple accelerators
""" """
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantization_config = AutoRoundConfig(backend="triton") quantization_config = AutoRoundConfig(backend="triton")

View File

@@ -21,10 +21,11 @@ from packaging import version
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
from transformers.testing_utils import ( from transformers.testing_utils import (
Expectations,
backend_empty_cache, backend_empty_cache,
get_device_properties, get_device_properties,
require_torch_gpu, require_torch_accelerator,
require_torch_multi_gpu, require_torch_multi_accelerator,
require_torchao, require_torchao,
require_torchao_version_greater_or_equal, require_torchao_version_greater_or_equal,
torch_device, torch_device,
@@ -52,6 +53,8 @@ if is_torchao_available():
if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0"): if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0"):
from torchao.dtypes import Int4CPULayout from torchao.dtypes import Int4CPULayout
if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.11.0"):
from torchao.dtypes import Int4XPULayout
def check_torchao_int4_wo_quantized(test_module, qlayer): def check_torchao_int4_wo_quantized(test_module, qlayer):
@@ -59,7 +62,13 @@ def check_torchao_int4_wo_quantized(test_module, qlayer):
test_module.assertEqual(weight.quant_min, 0) test_module.assertEqual(weight.quant_min, 0)
test_module.assertEqual(weight.quant_max, 15) test_module.assertEqual(weight.quant_max, 15)
test_module.assertTrue(isinstance(weight, AffineQuantizedTensor)) test_module.assertTrue(isinstance(weight, AffineQuantizedTensor))
layout = Int4CPULayout if weight.device.type == "cpu" else TensorCoreTiledLayout layout = None
if weight.device.type == "cpu":
layout = Int4CPULayout
elif weight.device.type == "xpu":
layout = Int4XPULayout
elif weight.device.type == "cuda":
layout = TensorCoreTiledLayout
test_module.assertTrue(isinstance(weight.tensor_impl._layout, layout)) test_module.assertTrue(isinstance(weight.tensor_impl._layout, layout))
@@ -123,7 +132,6 @@ class TorchAoConfigTest(unittest.TestCase):
class TorchAoTest(unittest.TestCase): class TorchAoTest(unittest.TestCase):
input_text = "What are we having for dinner?" input_text = "What are we having for dinner?"
max_new_tokens = 10 max_new_tokens = 10
EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
device = "cpu" device = "cpu"
quant_scheme_kwargs = ( quant_scheme_kwargs = (
@@ -132,6 +140,11 @@ class TorchAoTest(unittest.TestCase):
else {"group_size": 32} else {"group_size": 32}
) )
# called only once for all tests in this class
@classmethod
def setUpClass(cls):
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"
def tearDown(self): def tearDown(self):
gc.collect() gc.collect()
backend_empty_cache(torch_device) backend_empty_cache(torch_device)
@@ -261,11 +274,25 @@ class TorchAoTest(unittest.TestCase):
self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT) self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)
@require_torch_gpu @require_torch_accelerator
class TorchAoGPUTest(TorchAoTest): class TorchAoAcceleratorTest(TorchAoTest):
device = torch_device device = torch_device
quant_scheme_kwargs = {"group_size": 32} quant_scheme_kwargs = {"group_size": 32}
# called only once for all tests in this class
@classmethod
def setUpClass(cls):
super().setUpClass()
# fmt: off
EXPECTED_OUTPUTS = Expectations(
{
("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)",
("cuda", 7): "What are we having for dinner?\n- 1. What is the temperature outside",
}
)
# fmt: on
cls.EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()
def test_int4wo_offload(self): def test_int4wo_offload(self):
""" """
Simple test that checks if the quantized model int4 weight only is working properly with cpu/disk offload Simple test that checks if the quantized model int4 weight only is working properly with cpu/disk offload
@@ -312,16 +339,27 @@ class TorchAoGPUTest(TorchAoTest):
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device) input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
# fmt: off
EXPECTED_OUTPUTS = Expectations(
{
("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)",
("cuda", 7): "What are we having for dinner?\n- 2. What is the temperature outside",
}
)
# fmt: on
EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens) output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
EXPECTED_OUTPUT = "What are we having for dinner?\n- 2. What is the temperature outside" generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT) self.assertEqual(generated_text, EXPECTED_OUTPUT)
@require_torch_multi_gpu @require_torch_multi_accelerator
def test_int4wo_quant_multi_gpu(self): def test_int4wo_quant_multi_accelerator(self):
""" """
Simple test that checks if the quantized model int4 weight only is working properly with multiple GPUs Simple test that checks if the quantized model int4 weight only is working properly with multiple accelerators
set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs
set ZE_AFFINITY_MASK=0,1 if you have more than 2 Intel XPUs
""" """
quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs) quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)
@@ -373,7 +411,6 @@ class TorchAoGPUTest(TorchAoTest):
class TorchAoSerializationTest(unittest.TestCase): class TorchAoSerializationTest(unittest.TestCase):
input_text = "What are we having for dinner?" input_text = "What are we having for dinner?"
max_new_tokens = 10 max_new_tokens = 10
EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quant_scheme = "int4_weight_only" quant_scheme = "int4_weight_only"
quant_scheme_kwargs = ( quant_scheme_kwargs = (
@@ -387,6 +424,7 @@ class TorchAoSerializationTest(unittest.TestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name) cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"
def setUp(self): def setUp(self):
self.quant_config = TorchAoConfig(self.quant_scheme, **self.quant_scheme_kwargs) self.quant_config = TorchAoConfig(self.quant_scheme, **self.quant_scheme_kwargs)
@@ -430,58 +468,91 @@ class TorchAoSerializationTest(unittest.TestCase):
class TorchAoSerializationW8A8CPUTest(TorchAoSerializationTest): class TorchAoSerializationW8A8CPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {} quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {}
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
@require_torch_gpu # called only once for all tests in this class
def test_serialization_expected_output_on_cuda(self): @classmethod
def setUpClass(cls):
super().setUpClass()
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
@require_torch_accelerator
def test_serialization_expected_output_on_accelerator(self):
""" """
Test if we can serialize on device (cpu) and load/infer the model on cuda Test if we can serialize on device (cpu) and load/infer the model on accelerator
""" """
self.check_serialization_expected_output("cuda", self.EXPECTED_OUTPUT) self.check_serialization_expected_output(torch_device, self.EXPECTED_OUTPUT)
class TorchAoSerializationW8CPUTest(TorchAoSerializationTest): class TorchAoSerializationW8CPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_weight_only", {} quant_scheme, quant_scheme_kwargs = "int8_weight_only", {}
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
@require_torch_gpu # called only once for all tests in this class
def test_serialization_expected_output_on_cuda(self): @classmethod
def setUpClass(cls):
super().setUpClass()
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
@require_torch_accelerator
def test_serialization_expected_output_on_accelerator(self):
""" """
Test if we can serialize on device (cpu) and load/infer the model on cuda Test if we can serialize on device (cpu) and load/infer the model on accelerator
""" """
self.check_serialization_expected_output("cuda", self.EXPECTED_OUTPUT) self.check_serialization_expected_output(torch_device, self.EXPECTED_OUTPUT)
@require_torch_gpu @require_torch_accelerator
class TorchAoSerializationGPTTest(TorchAoSerializationTest): class TorchAoSerializationAcceleratorTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int4_weight_only", {"group_size": 32} quant_scheme, quant_scheme_kwargs = "int4_weight_only", {"group_size": 32}
device = f"{torch_device}:0" device = f"{torch_device}:0"
@require_torch_gpu
class TorchAoSerializationW8A8GPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {}
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
device = f"{torch_device}:0"
@require_torch_gpu
class TorchAoSerializationW8GPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_weight_only", {}
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
device = f"{torch_device}:0"
@require_torch_gpu
@require_torchao_version_greater_or_equal("0.10.0")
class TorchAoSerializationFP8GPUTest(TorchAoSerializationTest):
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
device = f"{torch_device}:0"
# called only once for all tests in this class # called only once for all tests in this class
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
if not (get_device_properties()[0] == "cuda" and get_device_properties()[1] >= 9): super().setUpClass()
# fmt: off
EXPECTED_OUTPUTS = Expectations(
{
("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)",
("cuda", 7): "What are we having for dinner?\n- 1. What is the temperature outside",
}
)
# fmt: on
cls.EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()
@require_torch_accelerator
class TorchAoSerializationW8A8AcceleratorTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {}
device = f"{torch_device}:0"
# called only once for all tests in this class
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
@require_torch_accelerator
class TorchAoSerializationW8AcceleratorTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_weight_only", {}
device = f"{torch_device}:0"
# called only once for all tests in this class
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
@require_torch_accelerator
@require_torchao_version_greater_or_equal("0.10.0")
class TorchAoSerializationFP8AcceleratorTest(TorchAoSerializationTest):
device = f"{torch_device}:0"
# called only once for all tests in this class
@classmethod
def setUpClass(cls):
if get_device_properties()[0] == "cuda" and get_device_properties()[1] < 9:
raise unittest.SkipTest("CUDA compute capability 9.0 or higher required for FP8 tests") raise unittest.SkipTest("CUDA compute capability 9.0 or higher required for FP8 tests")
from torchao.quantization import Float8WeightOnlyConfig from torchao.quantization import Float8WeightOnlyConfig
@@ -491,17 +562,18 @@ class TorchAoSerializationFP8GPUTest(TorchAoSerializationTest):
super().setUpClass() super().setUpClass()
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
@require_torch_gpu
@require_torch_accelerator
@require_torchao_version_greater_or_equal("0.10.0") @require_torchao_version_greater_or_equal("0.10.0")
class TorchAoSerializationA8W4Test(TorchAoSerializationTest): class TorchAoSerializationA8W4Test(TorchAoSerializationTest):
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
device = f"{torch_device}:0" device = f"{torch_device}:0"
# called only once for all tests in this class # called only once for all tests in this class
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
if not (get_device_properties()[0] == "cuda" and get_device_properties()[1] >= 9): if get_device_properties()[0] == "cuda" and get_device_properties()[1] < 9:
raise unittest.SkipTest("CUDA compute capability 9.0 or higher required for FP8 tests") raise unittest.SkipTest("CUDA compute capability 9.0 or higher required for FP8 tests")
from torchao.quantization import Int8DynamicActivationInt4WeightConfig from torchao.quantization import Int8DynamicActivationInt4WeightConfig
@@ -511,6 +583,8 @@ class TorchAoSerializationA8W4Test(TorchAoSerializationTest):
super().setUpClass() super().setUpClass()
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()