From 1ffc4dee5b0057b5f3de7deea577557b1fa0adb6 Mon Sep 17 00:00:00 2001 From: "Hz, Ji" Date: Mon, 6 Nov 2023 21:44:21 +0800 Subject: [PATCH] enable memory tracker metrics for npu (#27280) --- src/transformers/trainer_utils.py | 15 +++++++++++++++ tests/trainer/test_trainer.py | 6 +++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index dd793c0203..e6f26d0df5 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -459,6 +459,11 @@ class TrainerMemoryTracker: elif is_torch_xpu_available(): import torch + self.torch = torch + self.gpu = {} + elif is_torch_npu_available(): + import torch + self.torch = torch self.gpu = {} else: @@ -517,6 +522,9 @@ class TrainerMemoryTracker: elif is_torch_xpu_available(): self.torch.xpu.reset_peak_memory_stats() self.torch.xpu.empty_cache() + elif is_torch_npu_available(): + self.torch.npu.reset_peak_memory_stats() + self.torch.npu.empty_cache() # gpu if self.torch is not None: @@ -524,6 +532,8 @@ class TrainerMemoryTracker: self.gpu_mem_used_at_start = self.torch.cuda.memory_allocated() elif is_torch_xpu_available(): self.gpu_mem_used_at_start = self.torch.xpu.memory_allocated() + elif is_torch_npu_available(): + self.gpu_mem_used_at_start = self.torch.npu.memory_allocated() # cpu self.cpu_mem_used_at_start = self.cpu_mem_used() @@ -551,6 +561,8 @@ class TrainerMemoryTracker: self.torch.cuda.empty_cache() elif is_torch_xpu_available(): self.torch.xpu.empty_cache() + elif is_torch_npu_available(): + self.torch.npu.empty_cache() # concepts: # - alloc_delta: the difference of allocated memory between the end and the start @@ -565,6 +577,9 @@ class TrainerMemoryTracker: elif is_torch_xpu_available(): self.gpu_mem_used_now = self.torch.xpu.memory_allocated() self.gpu_mem_used_peak = self.torch.xpu.max_memory_allocated() + elif is_torch_npu_available(): + self.gpu_mem_used_now = self.torch.npu.memory_allocated() + self.gpu_mem_used_peak = self.torch.npu.max_memory_allocated() else: raise ValueError("No available GPU device found!") diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index ae6d8f7ae3..9df5ac84d7 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1944,18 +1944,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): metrics = trainer.train().metrics check_func("init_mem_cpu_alloc_delta", metrics) check_func("train_mem_cpu_alloc_delta", metrics) - if torch.cuda.device_count() > 0: + if backend_device_count(torch_device) > 0: check_func("init_mem_gpu_alloc_delta", metrics) check_func("train_mem_gpu_alloc_delta", metrics) metrics = trainer.evaluate() check_func("eval_mem_cpu_alloc_delta", metrics) - if torch.cuda.device_count() > 0: + if backend_device_count(torch_device) > 0: check_func("eval_mem_gpu_alloc_delta", metrics) metrics = trainer.predict(RegressionDataset()).metrics check_func("test_mem_cpu_alloc_delta", metrics) - if torch.cuda.device_count() > 0: + if backend_device_count(torch_device) > 0: check_func("test_mem_gpu_alloc_delta", metrics) def test_mem_metrics(self):