From 3bd1c201496bd39efb05193bf594e36128a55136 Mon Sep 17 00:00:00 2001
From: Yao Matrix <matrix.yao@intel.com>
Date: Tue, 20 May 2025 16:09:01 +0800
Subject: [PATCH] enable misc cases on XPU & use device agnostic APIs for cases
 in tests (#38192)

* use device agnostic APIs in tests

Signed-off-by: Matrix Yao <matrix.yao@intel.com>

* more

Signed-off-by: Matrix Yao <matrix.yao@intel.com>

* fix style

Signed-off-by: Matrix Yao <matrix.yao@intel.com>

* add reset_peak_memory_stats API

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* update

---------

Signed-off-by: Matrix Yao <matrix.yao@intel.com>
Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 src/transformers/testing_utils.py             | 11 +++++++++++
 tests/models/aria/test_modeling_aria.py       |  3 ++-
 .../models/cohere/test_tokenization_cohere.py |  8 ++++++--
 tests/models/colpali/test_modeling_colpali.py |  3 ++-
 .../models/idefics2/test_modeling_idefics2.py |  4 ++--
 .../test_modeling_phi4_multimodal.py          |  4 ++--
 .../qwen2_5_vl/test_modeling_qwen2_5_vl.py    |  3 ++-
 .../models/qwen2_vl/test_modeling_qwen2_vl.py |  3 ++-
 tests/models/whisper/test_modeling_whisper.py |  8 +++-----
 tests/tensor_parallel/test_tensor_parallel.py |  4 +++-
 tests/test_modeling_common.py                 |  7 ++++---
 tests/trainer/test_trainer.py                 | 19 ++++++++++---------
 tests/utils/test_cache_utils.py               |  5 +++--
 13 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index dd0a2a02d1..094bc4f232 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -3024,6 +3024,11 @@ if is_torch_available():
         "cpu": 0,
         "default": 0,
     }
+    BACKEND_RESET_PEAK_MEMORY_STATS = {
+        "cuda": torch.cuda.reset_peak_memory_stats,
+        "cpu": None,
+        "default": None,
+    }
     BACKEND_MEMORY_ALLOCATED = {
         "cuda": torch.cuda.memory_allocated,
         "cpu": 0,
@@ -3044,6 +3049,7 @@ else:
     BACKEND_EMPTY_CACHE = {"default": None}
     BACKEND_DEVICE_COUNT = {"default": lambda: 0}
     BACKEND_RESET_MAX_MEMORY_ALLOCATED = {"default": None}
+    BACKEND_RESET_PEAK_MEMORY_STATS = {"default": None}
     BACKEND_MAX_MEMORY_ALLOCATED = {"default": 0}
     BACKEND_MEMORY_ALLOCATED = {"default": 0}
     BACKEND_SYNCHRONIZE = {"default": None}
@@ -3072,6 +3078,7 @@ if is_torch_xpu_available():
     BACKEND_MANUAL_SEED["xpu"] = torch.xpu.manual_seed
     BACKEND_DEVICE_COUNT["xpu"] = torch.xpu.device_count
     BACKEND_RESET_MAX_MEMORY_ALLOCATED["xpu"] = torch.xpu.reset_peak_memory_stats
+    BACKEND_RESET_PEAK_MEMORY_STATS["xpu"] = torch.xpu.reset_peak_memory_stats
     BACKEND_MAX_MEMORY_ALLOCATED["xpu"] = torch.xpu.max_memory_allocated
     BACKEND_MEMORY_ALLOCATED["xpu"] = torch.xpu.memory_allocated
     BACKEND_SYNCHRONIZE["xpu"] = torch.xpu.synchronize
@@ -3100,6 +3107,10 @@ def backend_reset_max_memory_allocated(device: str):
     return _device_agnostic_dispatch(device, BACKEND_RESET_MAX_MEMORY_ALLOCATED)
 
 
+def backend_reset_peak_memory_stats(device: str):
+    return _device_agnostic_dispatch(device, BACKEND_RESET_PEAK_MEMORY_STATS)
+
+
 def backend_max_memory_allocated(device: str):
     return _device_agnostic_dispatch(device, BACKEND_MAX_MEMORY_ALLOCATED)
 
diff --git a/tests/models/aria/test_modeling_aria.py b/tests/models/aria/test_modeling_aria.py
index 3baa74a8bd..4a86e8e79f 100644
--- a/tests/models/aria/test_modeling_aria.py
+++ b/tests/models/aria/test_modeling_aria.py
@@ -30,6 +30,7 @@ from transformers import (
 )
 from transformers.models.idefics3 import Idefics3VisionConfig
 from transformers.testing_utils import (
+    backend_empty_cache,
     require_bitsandbytes,
     require_torch,
     require_torch_large_accelerator,
@@ -302,7 +303,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
 
     def tearDown(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @slow
     @require_torch_large_accelerator
diff --git a/tests/models/cohere/test_tokenization_cohere.py b/tests/models/cohere/test_tokenization_cohere.py
index d162b99981..9dd22d7b8b 100644
--- a/tests/models/cohere/test_tokenization_cohere.py
+++ b/tests/models/cohere/test_tokenization_cohere.py
@@ -17,7 +17,11 @@ import unittest
 from functools import lru_cache
 
 from transformers import CohereTokenizerFast
-from transformers.testing_utils import require_jinja, require_tokenizers, require_torch_multi_gpu
+from transformers.testing_utils import (
+    require_jinja,
+    require_tokenizers,
+    require_torch_multi_accelerator,
+)
 
 from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
 
@@ -55,7 +59,7 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         return CohereTokenizerFast.from_pretrained(pretrained_name, **kwargs)
 
     # This gives CPU OOM on a single-gpu runner (~60G RAM). On multi-gpu runner, it has ~180G RAM which is enough.
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_torch_encode_plus_sent_to_model(self):
         super().test_torch_encode_plus_sent_to_model()
 
diff --git a/tests/models/colpali/test_modeling_colpali.py b/tests/models/colpali/test_modeling_colpali.py
index 48d1dbf20a..2819592fe0 100644
--- a/tests/models/colpali/test_modeling_colpali.py
+++ b/tests/models/colpali/test_modeling_colpali.py
@@ -29,6 +29,7 @@ from transformers.models.colpali.configuration_colpali import ColPaliConfig
 from transformers.models.colpali.modeling_colpali import ColPaliForRetrieval, ColPaliForRetrievalOutput
 from transformers.models.colpali.processing_colpali import ColPaliProcessor
 from transformers.testing_utils import (
+    backend_empty_cache,
     require_torch,
     require_vision,
     slow,
@@ -303,7 +304,7 @@ class ColPaliModelIntegrationTest(unittest.TestCase):
 
     def tearDown(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @slow
     def test_model_integration_test(self):
diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py
index 325d971434..eea0c10d4b 100644
--- a/tests/models/idefics2/test_modeling_idefics2.py
+++ b/tests/models/idefics2/test_modeling_idefics2.py
@@ -35,7 +35,7 @@ from transformers.testing_utils import (
     require_flash_attn,
     require_torch,
     require_torch_gpu,
-    require_torch_multi_gpu,
+    require_torch_multi_accelerator,
     require_torch_sdpa,
     slow,
     torch_device,
@@ -583,7 +583,7 @@ class Idefics2ForConditionalGenerationIntegrationTest(unittest.TestCase):
         cleanup(torch_device, gc_collect=True)
 
     @slow
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_integration_test(self):
         model = Idefics2ForConditionalGeneration.from_pretrained(
             "HuggingFaceM4/idefics2-8b-base",
diff --git a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
index 4dd51cc34e..b29a6b4db9 100644
--- a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
+++ b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
@@ -31,7 +31,7 @@ from transformers import (
     is_torch_available,
     is_vision_available,
 )
-from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import backend_empty_cache, require_soundfile, require_torch, slow, torch_device
 from transformers.utils import is_soundfile_available
 
 from ...generation.test_utils import GenerationTesterMixin
@@ -296,7 +296,7 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
 
     def tearDown(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_text_only_generation(self):
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
index 5f06e1c84b..3a0f6458ad 100644
--- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
@@ -29,6 +29,7 @@ from transformers import (
     is_vision_available,
 )
 from transformers.testing_utils import (
+    backend_empty_cache,
     is_flaky,
     require_cv2,
     require_flash_attn,
@@ -421,7 +422,7 @@ class Qwen2_5_VLIntegrationTest(unittest.TestCase):
 
     def tearDown(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @slow
     def test_small_model_integration_test(self):
diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index 57e112790c..92b6d7f87f 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -28,6 +28,7 @@ from transformers import (
     is_vision_available,
 )
 from transformers.testing_utils import (
+    backend_empty_cache,
     require_flash_attn,
     require_torch,
     require_torch_gpu,
@@ -367,7 +368,7 @@ class Qwen2VLIntegrationTest(unittest.TestCase):
 
     def tearDown(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @slow
     def test_small_model_integration_test(self):
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 6aa31e7206..0446fb2052 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -32,7 +32,6 @@ from transformers import WhisperConfig
 from transformers.testing_utils import (
     is_flaky,
     require_flash_attn,
-    require_non_xpu,
     require_torch,
     require_torch_accelerator,
     require_torch_fp16,
@@ -42,7 +41,7 @@ from transformers.testing_utils import (
     slow,
     torch_device,
 )
-from transformers.utils import cached_property, is_torch_available, is_torchaudio_available
+from transformers.utils import cached_property, is_torch_available, is_torch_xpu_available, is_torchaudio_available
 from transformers.utils.import_utils import is_datasets_available
 
 from ...generation.test_utils import GenerationTesterMixin
@@ -2431,11 +2430,10 @@ class WhisperModelIntegrationTests(unittest.TestCase):
             " How many different species are there in the chilli? How many different species are there in the chilli?",
         )
 
-    @require_non_xpu
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_speculative_decoding_distil(self):
-        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+        torch_dtype = torch.float16 if (torch.cuda.is_available() or is_torch_xpu_available()) else torch.float32
         model_id = "openai/whisper-large-v2"
         model = WhisperForConditionalGeneration.from_pretrained(
             model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
diff --git a/tests/tensor_parallel/test_tensor_parallel.py b/tests/tensor_parallel/test_tensor_parallel.py
index a8ca732635..6174a61329 100644
--- a/tests/tensor_parallel/test_tensor_parallel.py
+++ b/tests/tensor_parallel/test_tensor_parallel.py
@@ -21,9 +21,11 @@ from transformers import is_torch_available
 from transformers.integrations.tensor_parallel import get_packed_weights, repack_weights
 from transformers.testing_utils import (
     TestCasePlus,
+    backend_device_count,
     get_torch_dist_unique_port,
     require_huggingface_hub_greater_or_equal,
     require_torch_multi_gpu,
+    torch_device,
 )
 
 
@@ -168,4 +170,4 @@ class TestTensorParallel(TestCasePlus):
 
 @require_torch_multi_gpu
 class TestTensorParallelCuda(TestTensorParallel):
-    nproc_per_node = torch.cuda.device_count()
+    nproc_per_node = backend_device_count(torch_device)
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 338047e95a..67da1ad857 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -73,6 +73,7 @@ from transformers.models.auto.modeling_auto import (
 )
 from transformers.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
     get_device_properties,
     hub_retry,
     is_flaky,
@@ -2652,7 +2653,7 @@ class ModelTesterMixin:
         config = self.model_tester.get_large_model_config()
 
         for model_class in self.all_parallelizable_model_classes:
-            torch.cuda.empty_cache()
+            backend_empty_cache(torch_device)
 
             # 1. single gpu memory load + unload + memory measurements
             # Retrieve initial memory usage (can easily be ~0.6-1.5GB if cuda-kernels have been preloaded by previous tests)
@@ -2668,7 +2669,7 @@ class ModelTesterMixin:
 
             del model
             gc.collect()
-            torch.cuda.empty_cache()
+            backend_empty_cache(torch_device)
 
             # 2. MP test
             # it's essential to re-calibrate the usage before the next stage
@@ -2692,7 +2693,7 @@ class ModelTesterMixin:
 
             del model
             gc.collect()
-            torch.cuda.empty_cache()
+            backend_empty_cache(torch_device)
 
     @require_torch_gpu
     @require_torch_multi_gpu
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index b5fb3d64e7..21b8622473 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -66,6 +66,7 @@ from transformers.testing_utils import (
     backend_max_memory_allocated,
     backend_memory_allocated,
     backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     evaluate_side_effect_factory,
     execute_subprocess_async,
     get_gpu_count,
@@ -1654,7 +1655,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         self.assertFalse(is_any_loss_nan_or_inf(log_history_filter))
 
     def test_train_and_eval_dataloaders(self):
-        if torch_device == "cuda":
+        if torch_device in ["cuda", "xpu"]:
             n_gpu = max(1, backend_device_count(torch_device))
         else:
             n_gpu = 1
@@ -4106,7 +4107,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         mod = MyModule()
 
         # 1. without TorchDynamo (eager baseline)
-        a = torch.ones(1024, 1024, device="cuda", requires_grad=True)
+        a = torch.ones(1024, 1024, device=torch_device, requires_grad=True)
         a.grad = None
         trainer = CustomTrainer(model=mod)
         # warmup
@@ -4115,17 +4116,17 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
 
         # resets
         gc.collect()
-        torch.cuda.empty_cache()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         orig_loss = trainer.training_step(mod, {"x": a})
-        orig_peak_mem = torch.cuda.max_memory_allocated()
+        orig_peak_mem = backend_max_memory_allocated(torch_device)
         torchdynamo.reset()
         del trainer
 
         # 2. TorchDynamo nvfuser
         with tempfile.TemporaryDirectory() as tmp_dir:
-            a = torch.ones(1024, 1024, device="cuda", requires_grad=True)
+            a = torch.ones(1024, 1024, device=torch_device, requires_grad=True)
             a.grad = None
             args = TrainingArguments(output_dir=tmp_dir, torch_compile_backend="nvfuser")
             trainer = CustomTrainer(model=mod, args=args)
@@ -4135,11 +4136,11 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
 
             # resets
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.reset_peak_memory_stats()
+            backend_empty_cache(torch_device)
+            backend_reset_peak_memory_stats(torch_device)
 
             loss = trainer.training_step(mod, {"x": a})
-            peak_mem = torch.cuda.max_memory_allocated()
+            peak_mem = backend_max_memory_allocated(torch_device)
             torchdynamo.reset()
             del trainer
 
diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py
index 243ae657c1..089b45c192 100644
--- a/tests/utils/test_cache_utils.py
+++ b/tests/utils/test_cache_utils.py
@@ -21,6 +21,7 @@ from transformers import set_seed
 from transformers.generation.configuration_utils import ALL_CACHE_IMPLEMENTATIONS
 from transformers.testing_utils import (
     CaptureStderr,
+    backend_device_count,
     cleanup,
     get_gpu_count,
     is_torch_available,
@@ -210,8 +211,8 @@ class CacheIntegrationTest(unittest.TestCase):
             if not has_accelerator:
                 self.skipTest("Offloaded caches require an accelerator")
             if cache_implementation in ["offloaded_static", "offloaded_hybrid_chunked"]:
-                if torch.cuda.device_count() != 1:
-                    self.skipTest("Offloaded static caches require exactly 1 GPU")
+                if backend_device_count(torch_device) != 1:
+                    self.skipTest("Offloaded static caches require exactly 1 accelerator")
 
     @parameterized.expand(TEST_CACHE_IMPLEMENTATIONS)
     def test_cache_batched(self, cache_implementation):