enable large_gpu and torchao cases on XPU (#38355)

* cohere2 done

Signed-off-by: Matrix Yao <matrix.yao@intel.com>

* enable torchao cases on XPU

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* rename

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix comments

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

---------

Signed-off-by: Matrix Yao <matrix.yao@intel.com>
Signed-off-by: Matrix YAO <matrix.yao@intel.com>
This commit is contained in:
Yao Matrix
2025-05-28 16:30:16 +08:00
committed by GitHub
parent cea254c909
commit fb82a98717
3 changed files with 151 additions and 62 deletions

View File

@@ -23,10 +23,11 @@ from pytest import mark
from transformers import AutoModelForCausalLM, AutoTokenizer, Cohere2Config, is_torch_available, pipeline
from transformers.generation.configuration_utils import GenerationConfig
from transformers.testing_utils import (
Expectations,
require_flash_attn,
require_read_token,
require_torch,
require_torch_large_gpu,
require_torch_large_accelerator,
slow,
torch_device,
)
@@ -130,7 +131,7 @@ class Cohere2ModelTest(CohereModelTest, unittest.TestCase):
@slow
@require_read_token
@require_torch_large_gpu
@require_torch_large_accelerator
class Cohere2IntegrationTest(unittest.TestCase):
input_text = ["Hello I am doing", "Hi today"]
@@ -155,10 +156,15 @@ class Cohere2IntegrationTest(unittest.TestCase):
def test_model_fp16(self):
model_id = "CohereForAI/c4ai-command-r7b-12-2024"
EXPECTED_TEXTS = [
"<BOS_TOKEN>Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have",
"<PAD><PAD><BOS_TOKEN>Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n",
]
# fmt: off
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): ["<BOS_TOKEN>Hello I am doing a project for my school and I need to create a website for a fictional company. I have the", "<PAD><PAD><BOS_TOKEN>Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n"],
("cuda", 7): ["<BOS_TOKEN>Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have", "<PAD><PAD><BOS_TOKEN>Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n",],
}
)
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
# fmt: on
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager"
@@ -170,7 +176,7 @@ class Cohere2IntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=False)
self.assertEqual(output_text, EXPECTED_TEXTS)
self.assertEqual(output_text, EXPECTED_TEXT)
def test_model_pipeline_bf16(self):
# See https://github.com/huggingface/transformers/pull/31747 -- pipeline was broken for Cohere2 before this PR
@@ -223,9 +229,15 @@ class Cohere2IntegrationTest(unittest.TestCase):
)
model_id = "CohereForAI/c4ai-command-r7b-12-2024"
EXPECTED_TEXT_COMPLETION = [
"Hello I am doing a project on the effects of social media on mental health. I have a few questions. 1. What is the relationship",
]
# fmt: off
EXPECTED_TEXT_COMPLETIONS = Expectations(
{
("xpu", 3): ["Hello I am doing a project for a friend and I am stuck on a few things. I have a 2004 Ford F-"],
("cuda", 7): ["Hello I am doing a project on the effects of social media on mental health. I have a few questions. 1. What is the relationship",],
}
)
EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
# fmt: on
tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token="<PAD>", padding_side="right")
# Load model
@@ -270,6 +282,9 @@ class Cohere2IntegrationTest(unittest.TestCase):
we need to correctly slice the attention mask in all cases (because we use a HybridCache).
Outputs for every attention functions should be coherent and identical.
"""
if torch_device == "xpu" and attn_implementation == "flash_attention_2":
self.skipTest(reason="Intel XPU doesn't support flash_attention_2 as of now.")
model_id = "CohereForAI/c4ai-command-r7b-12-2024"
EXPECTED_COMPLETIONS = [
" the mountains, the lakes, the rivers, the waterfalls, the waterfalls, the waterfalls, the waterfalls",