From 9167461a7db39a1a7c793bb46c5aed9e14e8492e Mon Sep 17 00:00:00 2001
From: Yao Matrix <matrix.yao@intel.com>
Date: Tue, 22 Apr 2025 23:39:10 +0800
Subject: [PATCH] enable mllama cases on xpu (#37644)

* enable mllama testing on xpu

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* enable more mllama cases

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* make cases pass on A100

Signed-off-by: N <matrix.yao@intel.com>

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Signed-off-by: N <matrix.yao@intel.com>
---
 tests/models/mllama/test_modeling_mllama.py | 85 +++++++++++++++++----
 1 file changed, 69 insertions(+), 16 deletions(-)

diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py
index 6308f6d4c0..41ad23d7d7 100644
--- a/tests/models/mllama/test_modeling_mllama.py
+++ b/tests/models/mllama/test_modeling_mllama.py
@@ -31,11 +31,12 @@ from transformers import (
 from transformers.cache_utils import Cache
 from transformers.models.mllama.configuration_mllama import MllamaTextConfig
 from transformers.testing_utils import (
+    Expectations,
     cleanup,
     require_bitsandbytes,
     require_read_token,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -524,7 +525,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         cleanup(torch_device, gc_collect=True)
 
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     @require_read_token
     def test_11b_model_integration_generate(self):
@@ -537,9 +538,18 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
 
         inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch_device)
 
+        input_ids = inputs["input_ids"]
+
         # Check inputs ids
-        expected_input_ids = torch.tensor([[128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]], device=torch_device)  # fmt: skip
-        self.assertTrue(torch.equal(inputs["input_ids"], expected_input_ids))
+        expected_input_ids_all = Expectations(
+            {
+                ("xpu", 3): torch.tensor([[128000, 128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]], device=torch_device),
+                ("cuda", 7): torch.tensor([[128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]], device=torch_device),
+                ("cuda", 8): torch.tensor([[128000, 128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]], device=torch_device),
+            }
+        )  # fmt: skip
+        expected_input_ids = expected_input_ids_all.get_expectation()
+        self.assertTrue(torch.equal(input_ids, expected_input_ids))
 
         # Load model in 4 bit
         quantization_config = BitsAndBytesConfig(load_in_4bit=True)
@@ -551,7 +561,14 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
 
         decoded_output = processor.decode(output[0], skip_special_tokens=True)
-        expected_output = "If I had to write a haiku for this one, it would be:.\\nI'm not a poet.\\nBut I'm a photographer.\\nAnd I'm a"  # fmt: skip
+        expected_outputs = Expectations(
+                {
+                    ("xpu", 3): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
+                    ("cuda", 7): "If I had to write a haiku for this one, it would be:.\\nI'm not a poet.\\nBut I'm a photographer.\\nAnd I'm a",
+                    ("cuda", 8): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
+                }
+            )  # fmt: skip
+        expected_output = expected_outputs.get_expectation()
 
         self.assertEqual(
             decoded_output,
@@ -560,7 +577,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         )
 
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     @require_read_token
     def test_11b_model_integration_generate_text_only(self):
@@ -568,10 +585,18 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         processor = AutoProcessor.from_pretrained(self.base_model_checkpoint)
         prompt = "If I had to write a haiku"
         inputs = processor(text=prompt, return_tensors="pt").to(torch_device)
+        input_ids = inputs["input_ids"].cpu().squeeze().tolist()
 
         # Check inputs ids
-        expected_input_ids = [128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342]
-        self.assertEqual(inputs["input_ids"].cpu().squeeze().tolist(), expected_input_ids)
+        expected_input_ids_all = Expectations(
+            {
+                ("xpu", 3): [128000, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342],
+                ("cuda", 7): [128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342],
+                ("cuda", 8): [128000, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342],
+            }
+        )
+        expected_input_ids = expected_input_ids_all.get_expectation()
+        self.assertEqual(input_ids, expected_input_ids)
 
         # Load model in 4 bit
         quantization_config = BitsAndBytesConfig(load_in_4bit=True)
@@ -583,8 +608,14 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
 
         decoded_output = processor.decode(output[0], skip_special_tokens=True)
-        expected_output = "If I had to write a haiku about my life, I think it would be something like:\n\"Life is a messy stream\nTwists and turns, ups"  # fmt: skip
-
+        expected_outputs = Expectations(
+                {
+                    ("xpu", 3): "If I had to write a haiku about my life, I would write:\nLife is a messy tapestry\n Threads of joy and sorrow\nWeft of memories",
+                    ("cuda", 7): "If I had to write a haiku about my life, I think it would be something like:\n\"Life is a messy stream\nTwists and turns, ups",
+                    ("cuda", 8): "If I had to write a haiku about my life, I would write:\nLife is a messy stream\nRipples of joy and pain\nFlowing, ever",
+                }
+            )  # fmt: skip
+        expected_output = expected_outputs.get_expectation()
         self.assertEqual(
             decoded_output,
             expected_output,
@@ -592,7 +623,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         )
 
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     @require_read_token
     def test_11b_model_integration_forward(self):
@@ -616,7 +647,15 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
             output = model(**inputs)
 
         actual_logits = output.logits[0, -1, :5].cpu()
-        expected_logits = torch.tensor([8.3594, 7.7148, 4.7266, 0.7803, 3.1504])
+        expected_logits_all = Expectations(
+            {
+                ("xpu", 3): torch.tensor([9.1562, 8.9141, 5.0664, 1.6855, 3.2324]),
+                ("cuda", 7): torch.tensor([8.3594, 7.7148, 4.7266, 0.7803, 3.1504]),
+                ("cuda", 8): torch.tensor([9.0703, 8.8750, 5.0781, 1.6279, 3.2207]),
+            }
+        )
+
+        expected_logits = expected_logits_all.get_expectation()
         self.assertTrue(
             torch.allclose(actual_logits, expected_logits, atol=0.1),
             f"Actual logits: {actual_logits}"
@@ -625,7 +664,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         )
 
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     @require_read_token
     def test_11b_model_integration_batched_generate(self):
@@ -653,7 +692,14 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
 
         # Check first output
         decoded_output = processor.decode(output[0], skip_special_tokens=True)
-        expected_output = "If I had to write a haiku for this one, it would be:.\\nI'm not a poet.\\nBut I'm a photographer.\\nAnd I'm a"  # fmt: skip
+        expected_outputs = Expectations(
+                {
+                    ("xpu", 3): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
+                    ("cuda", 7): "If I had to write a haiku for this one, it would be:.\\nI'm not a poet.\\nBut I'm a photographer.\\nAnd I'm a",
+                    ("cuda", 8): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
+                 }
+            )  # fmt: skip
+        expected_output = expected_outputs.get_expectation()
 
         self.assertEqual(
             decoded_output,
@@ -663,7 +709,14 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
 
         # Check second output
         decoded_output = processor.decode(output[1], skip_special_tokens=True)
-        expected_output = "This image shows is a photograph of a stop sign in front of a Chinese archway. The stop sign is red with white letters and is"  # fmt: skip
+        expected_outputs = Expectations(
+                {
+                    ("xpu", 3): "This image shows\nI'm not able to provide information on the person in this image. I can give you an idea of what's happening",
+                    ("cuda", 7): "This image shows is a photograph of a stop sign in front of a Chinese archway. The stop sign is red with white letters and is",
+                    ("cuda", 8): "This image shows\nI'm not able to provide information on the person in this image. I can give you an idea of what's happening",
+                }
+            )  # fmt: skip
+        expected_output = expected_outputs.get_expectation()
 
         self.assertEqual(
             decoded_output,
@@ -672,7 +725,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         )
 
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     @require_read_token
     def test_11b_model_integration_multi_image_generate(self):