enable large_gpu and torchao cases on XPU (#38355)

* cohere2 done

Signed-off-by: Matrix Yao <matrix.yao@intel.com>

* enable torchao cases on XPU

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* rename

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

* fix comments

Signed-off-by: Matrix YAO <matrix.yao@intel.com>

---------

Signed-off-by: Matrix Yao <matrix.yao@intel.com>
Signed-off-by: Matrix YAO <matrix.yao@intel.com>
2025-05-28 16:30:16 +08:00
parent cea254c909
commit fb82a98717
3 changed files with 151 additions and 62 deletions
--- a/tests/models/cohere2/test_modeling_cohere2.py
+++ b/tests/models/cohere2/test_modeling_cohere2.py
@@ -23,10 +23,11 @@ from pytest import mark
 from transformers import AutoModelForCausalLM, AutoTokenizer, Cohere2Config, is_torch_available, pipeline
 from transformers.generation.configuration_utils import GenerationConfig
 from transformers.testing_utils import (
+    Expectations,
    require_flash_attn,
    require_read_token,
    require_torch,
-    require_torch_large_gpu,
+    require_torch_large_accelerator,
    slow,
    torch_device,
 )
@@ -130,7 +131,7 @@ class Cohere2ModelTest(CohereModelTest, unittest.TestCase):

@slow
@require_read_token
-@require_torch_large_gpu
+@require_torch_large_accelerator
 class Cohere2IntegrationTest(unittest.TestCase):
    input_text = ["Hello I am doing", "Hi today"]

@@ -155,10 +156,15 @@ class Cohere2IntegrationTest(unittest.TestCase):

    def test_model_fp16(self):
        model_id = "CohereForAI/c4ai-command-r7b-12-2024"
-        EXPECTED_TEXTS = [
-            "<BOS_TOKEN>Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have",
-            "<PAD><PAD><BOS_TOKEN>Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n",
-        ]
+        # fmt: off
+        EXPECTED_TEXTS = Expectations(
+            {
+                ("xpu", 3): ["<BOS_TOKEN>Hello I am doing a project for my school and I need to create a website for a fictional company. I have the", "<PAD><PAD><BOS_TOKEN>Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n"],
+                ("cuda", 7): ["<BOS_TOKEN>Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have", "<PAD><PAD><BOS_TOKEN>Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n",],
+            }
+        )
+        EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
+        # fmt: on

        model = AutoModelForCausalLM.from_pretrained(
            model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager"
@@ -170,7 +176,7 @@ class Cohere2IntegrationTest(unittest.TestCase):
        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        output_text = tokenizer.batch_decode(output, skip_special_tokens=False)

-        self.assertEqual(output_text, EXPECTED_TEXTS)
+        self.assertEqual(output_text, EXPECTED_TEXT)

    def test_model_pipeline_bf16(self):
        # See https://github.com/huggingface/transformers/pull/31747 -- pipeline was broken for Cohere2 before this PR
@@ -223,9 +229,15 @@ class Cohere2IntegrationTest(unittest.TestCase):
        )

        model_id = "CohereForAI/c4ai-command-r7b-12-2024"
-        EXPECTED_TEXT_COMPLETION = [
-            "Hello I am doing a project on the effects of social media on mental health. I have a few questions. 1. What is the relationship",
-        ]
+        # fmt: off
+        EXPECTED_TEXT_COMPLETIONS = Expectations(
+            {
+                ("xpu", 3): ["Hello I am doing a project for a friend and I am stuck on a few things. I have a 2004 Ford F-"],
+                ("cuda", 7): ["Hello I am doing a project on the effects of social media on mental health. I have a few questions. 1. What is the relationship",],
+            }
+        )
+        EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
+        # fmt: on

        tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token="<PAD>", padding_side="right")
        # Load model
@@ -270,6 +282,9 @@ class Cohere2IntegrationTest(unittest.TestCase):
        we need to correctly slice the attention mask in all cases (because we use a HybridCache).
        Outputs for every attention functions should be coherent and identical.
        """
+        if torch_device == "xpu" and attn_implementation == "flash_attention_2":
+            self.skipTest(reason="Intel XPU doesn't support flash_attention_2 as of now.")
+
        model_id = "CohereForAI/c4ai-command-r7b-12-2024"
        EXPECTED_COMPLETIONS = [
            " the mountains, the lakes, the rivers, the waterfalls, the waterfalls, the waterfalls, the waterfalls",