From 3bd1c201496bd39efb05193bf594e36128a55136 Mon Sep 17 00:00:00 2001 From: Yao Matrix Date: Tue, 20 May 2025 16:09:01 +0800 Subject: [PATCH] enable misc cases on XPU & use device agnostic APIs for cases in tests (#38192) * use device agnostic APIs in tests Signed-off-by: Matrix Yao * more Signed-off-by: Matrix Yao * fix style Signed-off-by: Matrix Yao * add reset_peak_memory_stats API Signed-off-by: YAO Matrix * update --------- Signed-off-by: Matrix Yao Signed-off-by: YAO Matrix Co-authored-by: ydshieh --- src/transformers/testing_utils.py | 11 +++++++++++ tests/models/aria/test_modeling_aria.py | 3 ++- .../models/cohere/test_tokenization_cohere.py | 8 ++++++-- tests/models/colpali/test_modeling_colpali.py | 3 ++- .../models/idefics2/test_modeling_idefics2.py | 4 ++-- .../test_modeling_phi4_multimodal.py | 4 ++-- .../qwen2_5_vl/test_modeling_qwen2_5_vl.py | 3 ++- .../models/qwen2_vl/test_modeling_qwen2_vl.py | 3 ++- tests/models/whisper/test_modeling_whisper.py | 8 +++----- tests/tensor_parallel/test_tensor_parallel.py | 4 +++- tests/test_modeling_common.py | 7 ++++--- tests/trainer/test_trainer.py | 19 ++++++++++--------- tests/utils/test_cache_utils.py | 5 +++-- 13 files changed, 52 insertions(+), 30 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index dd0a2a02d1..094bc4f232 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -3024,6 +3024,11 @@ if is_torch_available(): "cpu": 0, "default": 0, } + BACKEND_RESET_PEAK_MEMORY_STATS = { + "cuda": torch.cuda.reset_peak_memory_stats, + "cpu": None, + "default": None, + } BACKEND_MEMORY_ALLOCATED = { "cuda": torch.cuda.memory_allocated, "cpu": 0, @@ -3044,6 +3049,7 @@ else: BACKEND_EMPTY_CACHE = {"default": None} BACKEND_DEVICE_COUNT = {"default": lambda: 0} BACKEND_RESET_MAX_MEMORY_ALLOCATED = {"default": None} + BACKEND_RESET_PEAK_MEMORY_STATS = {"default": None} BACKEND_MAX_MEMORY_ALLOCATED = {"default": 0} BACKEND_MEMORY_ALLOCATED = {"default": 0} BACKEND_SYNCHRONIZE = {"default": None} @@ -3072,6 +3078,7 @@ if is_torch_xpu_available(): BACKEND_MANUAL_SEED["xpu"] = torch.xpu.manual_seed BACKEND_DEVICE_COUNT["xpu"] = torch.xpu.device_count BACKEND_RESET_MAX_MEMORY_ALLOCATED["xpu"] = torch.xpu.reset_peak_memory_stats + BACKEND_RESET_PEAK_MEMORY_STATS["xpu"] = torch.xpu.reset_peak_memory_stats BACKEND_MAX_MEMORY_ALLOCATED["xpu"] = torch.xpu.max_memory_allocated BACKEND_MEMORY_ALLOCATED["xpu"] = torch.xpu.memory_allocated BACKEND_SYNCHRONIZE["xpu"] = torch.xpu.synchronize @@ -3100,6 +3107,10 @@ def backend_reset_max_memory_allocated(device: str): return _device_agnostic_dispatch(device, BACKEND_RESET_MAX_MEMORY_ALLOCATED) +def backend_reset_peak_memory_stats(device: str): + return _device_agnostic_dispatch(device, BACKEND_RESET_PEAK_MEMORY_STATS) + + def backend_max_memory_allocated(device: str): return _device_agnostic_dispatch(device, BACKEND_MAX_MEMORY_ALLOCATED) diff --git a/tests/models/aria/test_modeling_aria.py b/tests/models/aria/test_modeling_aria.py index 3baa74a8bd..4a86e8e79f 100644 --- a/tests/models/aria/test_modeling_aria.py +++ b/tests/models/aria/test_modeling_aria.py @@ -30,6 +30,7 @@ from transformers import ( ) from transformers.models.idefics3 import Idefics3VisionConfig from transformers.testing_utils import ( + backend_empty_cache, require_bitsandbytes, require_torch, require_torch_large_accelerator, @@ -302,7 +303,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase): def tearDown(self): gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) @slow @require_torch_large_accelerator diff --git a/tests/models/cohere/test_tokenization_cohere.py b/tests/models/cohere/test_tokenization_cohere.py index d162b99981..9dd22d7b8b 100644 --- a/tests/models/cohere/test_tokenization_cohere.py +++ b/tests/models/cohere/test_tokenization_cohere.py @@ -17,7 +17,11 @@ import unittest from functools import lru_cache from transformers import CohereTokenizerFast -from transformers.testing_utils import require_jinja, require_tokenizers, require_torch_multi_gpu +from transformers.testing_utils import ( + require_jinja, + require_tokenizers, + require_torch_multi_accelerator, +) from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @@ -55,7 +59,7 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase): return CohereTokenizerFast.from_pretrained(pretrained_name, **kwargs) # This gives CPU OOM on a single-gpu runner (~60G RAM). On multi-gpu runner, it has ~180G RAM which is enough. - @require_torch_multi_gpu + @require_torch_multi_accelerator def test_torch_encode_plus_sent_to_model(self): super().test_torch_encode_plus_sent_to_model() diff --git a/tests/models/colpali/test_modeling_colpali.py b/tests/models/colpali/test_modeling_colpali.py index 48d1dbf20a..2819592fe0 100644 --- a/tests/models/colpali/test_modeling_colpali.py +++ b/tests/models/colpali/test_modeling_colpali.py @@ -29,6 +29,7 @@ from transformers.models.colpali.configuration_colpali import ColPaliConfig from transformers.models.colpali.modeling_colpali import ColPaliForRetrieval, ColPaliForRetrievalOutput from transformers.models.colpali.processing_colpali import ColPaliProcessor from transformers.testing_utils import ( + backend_empty_cache, require_torch, require_vision, slow, @@ -303,7 +304,7 @@ class ColPaliModelIntegrationTest(unittest.TestCase): def tearDown(self): gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) @slow def test_model_integration_test(self): diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 325d971434..eea0c10d4b 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -35,7 +35,7 @@ from transformers.testing_utils import ( require_flash_attn, require_torch, require_torch_gpu, - require_torch_multi_gpu, + require_torch_multi_accelerator, require_torch_sdpa, slow, torch_device, @@ -583,7 +583,7 @@ class Idefics2ForConditionalGenerationIntegrationTest(unittest.TestCase): cleanup(torch_device, gc_collect=True) @slow - @require_torch_multi_gpu + @require_torch_multi_accelerator def test_integration_test(self): model = Idefics2ForConditionalGeneration.from_pretrained( "HuggingFaceM4/idefics2-8b-base", diff --git a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py index 4dd51cc34e..b29a6b4db9 100644 --- a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py +++ b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py @@ -31,7 +31,7 @@ from transformers import ( is_torch_available, is_vision_available, ) -from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device +from transformers.testing_utils import backend_empty_cache, require_soundfile, require_torch, slow, torch_device from transformers.utils import is_soundfile_available from ...generation.test_utils import GenerationTesterMixin @@ -296,7 +296,7 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase): def tearDown(self): gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_text_only_generation(self): model = AutoModelForCausalLM.from_pretrained( diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py index 5f06e1c84b..3a0f6458ad 100644 --- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py @@ -29,6 +29,7 @@ from transformers import ( is_vision_available, ) from transformers.testing_utils import ( + backend_empty_cache, is_flaky, require_cv2, require_flash_attn, @@ -421,7 +422,7 @@ class Qwen2_5_VLIntegrationTest(unittest.TestCase): def tearDown(self): gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) @slow def test_small_model_integration_test(self): diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 57e112790c..92b6d7f87f 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -28,6 +28,7 @@ from transformers import ( is_vision_available, ) from transformers.testing_utils import ( + backend_empty_cache, require_flash_attn, require_torch, require_torch_gpu, @@ -367,7 +368,7 @@ class Qwen2VLIntegrationTest(unittest.TestCase): def tearDown(self): gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) @slow def test_small_model_integration_test(self): diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 6aa31e7206..0446fb2052 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -32,7 +32,6 @@ from transformers import WhisperConfig from transformers.testing_utils import ( is_flaky, require_flash_attn, - require_non_xpu, require_torch, require_torch_accelerator, require_torch_fp16, @@ -42,7 +41,7 @@ from transformers.testing_utils import ( slow, torch_device, ) -from transformers.utils import cached_property, is_torch_available, is_torchaudio_available +from transformers.utils import cached_property, is_torch_available, is_torch_xpu_available, is_torchaudio_available from transformers.utils.import_utils import is_datasets_available from ...generation.test_utils import GenerationTesterMixin @@ -2431,11 +2430,10 @@ class WhisperModelIntegrationTests(unittest.TestCase): " How many different species are there in the chilli? How many different species are there in the chilli?", ) - @require_non_xpu @slow - @require_torch_gpu + @require_torch_accelerator def test_speculative_decoding_distil(self): - torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 + torch_dtype = torch.float16 if (torch.cuda.is_available() or is_torch_xpu_available()) else torch.float32 model_id = "openai/whisper-large-v2" model = WhisperForConditionalGeneration.from_pretrained( model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True diff --git a/tests/tensor_parallel/test_tensor_parallel.py b/tests/tensor_parallel/test_tensor_parallel.py index a8ca732635..6174a61329 100644 --- a/tests/tensor_parallel/test_tensor_parallel.py +++ b/tests/tensor_parallel/test_tensor_parallel.py @@ -21,9 +21,11 @@ from transformers import is_torch_available from transformers.integrations.tensor_parallel import get_packed_weights, repack_weights from transformers.testing_utils import ( TestCasePlus, + backend_device_count, get_torch_dist_unique_port, require_huggingface_hub_greater_or_equal, require_torch_multi_gpu, + torch_device, ) @@ -168,4 +170,4 @@ class TestTensorParallel(TestCasePlus): @require_torch_multi_gpu class TestTensorParallelCuda(TestTensorParallel): - nproc_per_node = torch.cuda.device_count() + nproc_per_node = backend_device_count(torch_device) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 338047e95a..67da1ad857 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -73,6 +73,7 @@ from transformers.models.auto.modeling_auto import ( ) from transformers.testing_utils import ( CaptureLogger, + backend_empty_cache, get_device_properties, hub_retry, is_flaky, @@ -2652,7 +2653,7 @@ class ModelTesterMixin: config = self.model_tester.get_large_model_config() for model_class in self.all_parallelizable_model_classes: - torch.cuda.empty_cache() + backend_empty_cache(torch_device) # 1. single gpu memory load + unload + memory measurements # Retrieve initial memory usage (can easily be ~0.6-1.5GB if cuda-kernels have been preloaded by previous tests) @@ -2668,7 +2669,7 @@ class ModelTesterMixin: del model gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) # 2. MP test # it's essential to re-calibrate the usage before the next stage @@ -2692,7 +2693,7 @@ class ModelTesterMixin: del model gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) @require_torch_gpu @require_torch_multi_gpu diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index b5fb3d64e7..21b8622473 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -66,6 +66,7 @@ from transformers.testing_utils import ( backend_max_memory_allocated, backend_memory_allocated, backend_reset_max_memory_allocated, + backend_reset_peak_memory_stats, evaluate_side_effect_factory, execute_subprocess_async, get_gpu_count, @@ -1654,7 +1655,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): self.assertFalse(is_any_loss_nan_or_inf(log_history_filter)) def test_train_and_eval_dataloaders(self): - if torch_device == "cuda": + if torch_device in ["cuda", "xpu"]: n_gpu = max(1, backend_device_count(torch_device)) else: n_gpu = 1 @@ -4106,7 +4107,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): mod = MyModule() # 1. without TorchDynamo (eager baseline) - a = torch.ones(1024, 1024, device="cuda", requires_grad=True) + a = torch.ones(1024, 1024, device=torch_device, requires_grad=True) a.grad = None trainer = CustomTrainer(model=mod) # warmup @@ -4115,17 +4116,17 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): # resets gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) orig_loss = trainer.training_step(mod, {"x": a}) - orig_peak_mem = torch.cuda.max_memory_allocated() + orig_peak_mem = backend_max_memory_allocated(torch_device) torchdynamo.reset() del trainer # 2. TorchDynamo nvfuser with tempfile.TemporaryDirectory() as tmp_dir: - a = torch.ones(1024, 1024, device="cuda", requires_grad=True) + a = torch.ones(1024, 1024, device=torch_device, requires_grad=True) a.grad = None args = TrainingArguments(output_dir=tmp_dir, torch_compile_backend="nvfuser") trainer = CustomTrainer(model=mod, args=args) @@ -4135,11 +4136,11 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): # resets gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) loss = trainer.training_step(mod, {"x": a}) - peak_mem = torch.cuda.max_memory_allocated() + peak_mem = backend_max_memory_allocated(torch_device) torchdynamo.reset() del trainer diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py index 243ae657c1..089b45c192 100644 --- a/tests/utils/test_cache_utils.py +++ b/tests/utils/test_cache_utils.py @@ -21,6 +21,7 @@ from transformers import set_seed from transformers.generation.configuration_utils import ALL_CACHE_IMPLEMENTATIONS from transformers.testing_utils import ( CaptureStderr, + backend_device_count, cleanup, get_gpu_count, is_torch_available, @@ -210,8 +211,8 @@ class CacheIntegrationTest(unittest.TestCase): if not has_accelerator: self.skipTest("Offloaded caches require an accelerator") if cache_implementation in ["offloaded_static", "offloaded_hybrid_chunked"]: - if torch.cuda.device_count() != 1: - self.skipTest("Offloaded static caches require exactly 1 GPU") + if backend_device_count(torch_device) != 1: + self.skipTest("Offloaded static caches require exactly 1 accelerator") @parameterized.expand(TEST_CACHE_IMPLEMENTATIONS) def test_cache_batched(self, cache_implementation):