enable memory tracker metrics for npu (#27280)
This commit is contained in:
@@ -459,6 +459,11 @@ class TrainerMemoryTracker:
|
|||||||
elif is_torch_xpu_available():
|
elif is_torch_xpu_available():
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
self.torch = torch
|
||||||
|
self.gpu = {}
|
||||||
|
elif is_torch_npu_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
self.torch = torch
|
self.torch = torch
|
||||||
self.gpu = {}
|
self.gpu = {}
|
||||||
else:
|
else:
|
||||||
@@ -517,6 +522,9 @@ class TrainerMemoryTracker:
|
|||||||
elif is_torch_xpu_available():
|
elif is_torch_xpu_available():
|
||||||
self.torch.xpu.reset_peak_memory_stats()
|
self.torch.xpu.reset_peak_memory_stats()
|
||||||
self.torch.xpu.empty_cache()
|
self.torch.xpu.empty_cache()
|
||||||
|
elif is_torch_npu_available():
|
||||||
|
self.torch.npu.reset_peak_memory_stats()
|
||||||
|
self.torch.npu.empty_cache()
|
||||||
|
|
||||||
# gpu
|
# gpu
|
||||||
if self.torch is not None:
|
if self.torch is not None:
|
||||||
@@ -524,6 +532,8 @@ class TrainerMemoryTracker:
|
|||||||
self.gpu_mem_used_at_start = self.torch.cuda.memory_allocated()
|
self.gpu_mem_used_at_start = self.torch.cuda.memory_allocated()
|
||||||
elif is_torch_xpu_available():
|
elif is_torch_xpu_available():
|
||||||
self.gpu_mem_used_at_start = self.torch.xpu.memory_allocated()
|
self.gpu_mem_used_at_start = self.torch.xpu.memory_allocated()
|
||||||
|
elif is_torch_npu_available():
|
||||||
|
self.gpu_mem_used_at_start = self.torch.npu.memory_allocated()
|
||||||
|
|
||||||
# cpu
|
# cpu
|
||||||
self.cpu_mem_used_at_start = self.cpu_mem_used()
|
self.cpu_mem_used_at_start = self.cpu_mem_used()
|
||||||
@@ -551,6 +561,8 @@ class TrainerMemoryTracker:
|
|||||||
self.torch.cuda.empty_cache()
|
self.torch.cuda.empty_cache()
|
||||||
elif is_torch_xpu_available():
|
elif is_torch_xpu_available():
|
||||||
self.torch.xpu.empty_cache()
|
self.torch.xpu.empty_cache()
|
||||||
|
elif is_torch_npu_available():
|
||||||
|
self.torch.npu.empty_cache()
|
||||||
|
|
||||||
# concepts:
|
# concepts:
|
||||||
# - alloc_delta: the difference of allocated memory between the end and the start
|
# - alloc_delta: the difference of allocated memory between the end and the start
|
||||||
@@ -565,6 +577,9 @@ class TrainerMemoryTracker:
|
|||||||
elif is_torch_xpu_available():
|
elif is_torch_xpu_available():
|
||||||
self.gpu_mem_used_now = self.torch.xpu.memory_allocated()
|
self.gpu_mem_used_now = self.torch.xpu.memory_allocated()
|
||||||
self.gpu_mem_used_peak = self.torch.xpu.max_memory_allocated()
|
self.gpu_mem_used_peak = self.torch.xpu.max_memory_allocated()
|
||||||
|
elif is_torch_npu_available():
|
||||||
|
self.gpu_mem_used_now = self.torch.npu.memory_allocated()
|
||||||
|
self.gpu_mem_used_peak = self.torch.npu.max_memory_allocated()
|
||||||
else:
|
else:
|
||||||
raise ValueError("No available GPU device found!")
|
raise ValueError("No available GPU device found!")
|
||||||
|
|
||||||
|
|||||||
@@ -1944,18 +1944,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
metrics = trainer.train().metrics
|
metrics = trainer.train().metrics
|
||||||
check_func("init_mem_cpu_alloc_delta", metrics)
|
check_func("init_mem_cpu_alloc_delta", metrics)
|
||||||
check_func("train_mem_cpu_alloc_delta", metrics)
|
check_func("train_mem_cpu_alloc_delta", metrics)
|
||||||
if torch.cuda.device_count() > 0:
|
if backend_device_count(torch_device) > 0:
|
||||||
check_func("init_mem_gpu_alloc_delta", metrics)
|
check_func("init_mem_gpu_alloc_delta", metrics)
|
||||||
check_func("train_mem_gpu_alloc_delta", metrics)
|
check_func("train_mem_gpu_alloc_delta", metrics)
|
||||||
|
|
||||||
metrics = trainer.evaluate()
|
metrics = trainer.evaluate()
|
||||||
check_func("eval_mem_cpu_alloc_delta", metrics)
|
check_func("eval_mem_cpu_alloc_delta", metrics)
|
||||||
if torch.cuda.device_count() > 0:
|
if backend_device_count(torch_device) > 0:
|
||||||
check_func("eval_mem_gpu_alloc_delta", metrics)
|
check_func("eval_mem_gpu_alloc_delta", metrics)
|
||||||
|
|
||||||
metrics = trainer.predict(RegressionDataset()).metrics
|
metrics = trainer.predict(RegressionDataset()).metrics
|
||||||
check_func("test_mem_cpu_alloc_delta", metrics)
|
check_func("test_mem_cpu_alloc_delta", metrics)
|
||||||
if torch.cuda.device_count() > 0:
|
if backend_device_count(torch_device) > 0:
|
||||||
check_func("test_mem_gpu_alloc_delta", metrics)
|
check_func("test_mem_gpu_alloc_delta", metrics)
|
||||||
|
|
||||||
def test_mem_metrics(self):
|
def test_mem_metrics(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user