From 8db4d791618d396dba72742534ac456e5b9b5318 Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Tue, 29 Jul 2025 22:06:52 +0800
Subject: [PATCH] Enable xpu allocator on caching_allocator_warmup (#39654)

* add xpu allocator

Signed-off-by: jiqing-feng

* fix typo

Signed-off-by: jiqing-feng

* fix variable name

Signed-off-by: jiqing-feng

* rm useless default value

Signed-off-by: jiqing-feng

---------

Signed-off-by: jiqing-feng
---
 src/transformers/modeling_utils.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 5a7c6eb087..9db73955b0 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -6021,19 +6021,22 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict,
 
     # This will kick off the caching allocator to avoid having to Malloc afterwards
     for device, byte_count in total_byte_count.items():
-        if device.type == "cuda":
-            index = device.index if device.index is not None else torch.cuda.current_device()
-            device_memory = torch.cuda.mem_get_info(index)[0]
+        if device.type in ["cuda", "xpu"]:
+            torch_accelerator_module = getattr(torch, device.type)
+            index = device.index if device.index is not None else torch_accelerator_module.current_device()
+            device_memory = torch_accelerator_module.mem_get_info(index)[0]
             # Allow up to (max device memory - 1.2 GiB) in resource-constrained hardware configurations. Trying to reserve more
-            # than that amount might sometimes lead to unnecessary cuda OOM, if the last parameter to be loaded on the device is large,
+            # than that amount might sometimes lead to unnecessary cuda/xpu OOM, if the last parameter to be loaded on the device is large,
             # and the remaining reserved memory portion is smaller than the param size -> torch will then try to fully re-allocate all
             # the param size, instead of using the remaining reserved part, and allocating only the difference, which can lead
             # to OOM. See https://github.com/huggingface/transformers/issues/37436#issuecomment-2808982161 for more details.
             # Note that we use an absolute value instead of device proportion here, as a 8GiB device could still allocate too much
             # if using e.g. 90% of device size, while a 140GiB device would allocate too little
             byte_count = min(byte_count, max(0, int(device_memory - 1.2 * 1024**3)))
-            # If there is *unused* reserved cuda memory, we can skip/reduce the allocation.
-            unused_memory = torch.cuda.memory_reserved(index) - torch.cuda.memory_allocated(index)
+            # If there is *unused* reserved cuda/xpu memory, we can skip/reduce the allocation.
+            unused_memory = torch_accelerator_module.memory_reserved(
+                index
+            ) - torch_accelerator_module.memory_allocated(index)
             byte_count = max(0, byte_count - unused_memory)
             # Allocate memory
             _ = torch.empty(byte_count // factor, dtype=torch.float16, device=device, requires_grad=False)