Enable xpu allocator on caching_allocator_warmup (#39654)

* add xpu allocator

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix typo

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix variable name

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* rm useless default value

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
This commit is contained in:
jiqing-feng
2025-07-29 22:06:52 +08:00
committed by GitHub
parent fb141e2c90
commit 8db4d79161

View File

@@ -6021,19 +6021,22 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict,
# This will kick off the caching allocator to avoid having to Malloc afterwards # This will kick off the caching allocator to avoid having to Malloc afterwards
for device, byte_count in total_byte_count.items(): for device, byte_count in total_byte_count.items():
if device.type == "cuda": if device.type in ["cuda", "xpu"]:
index = device.index if device.index is not None else torch.cuda.current_device() torch_accelerator_module = getattr(torch, device.type)
device_memory = torch.cuda.mem_get_info(index)[0] index = device.index if device.index is not None else torch_accelerator_module.current_device()
device_memory = torch_accelerator_module.mem_get_info(index)[0]
# Allow up to (max device memory - 1.2 GiB) in resource-constrained hardware configurations. Trying to reserve more # Allow up to (max device memory - 1.2 GiB) in resource-constrained hardware configurations. Trying to reserve more
# than that amount might sometimes lead to unnecessary cuda OOM, if the last parameter to be loaded on the device is large, # than that amount might sometimes lead to unnecessary cuda/xpu OOM, if the last parameter to be loaded on the device is large,
# and the remaining reserved memory portion is smaller than the param size -> torch will then try to fully re-allocate all # and the remaining reserved memory portion is smaller than the param size -> torch will then try to fully re-allocate all
# the param size, instead of using the remaining reserved part, and allocating only the difference, which can lead # the param size, instead of using the remaining reserved part, and allocating only the difference, which can lead
# to OOM. See https://github.com/huggingface/transformers/issues/37436#issuecomment-2808982161 for more details. # to OOM. See https://github.com/huggingface/transformers/issues/37436#issuecomment-2808982161 for more details.
# Note that we use an absolute value instead of device proportion here, as an 8GiB device could still allocate too much # Note that we use an absolute value instead of device proportion here, as an 8GiB device could still allocate too much
# if using e.g. 90% of device size, while a 140GiB device would allocate too little # if using e.g. 90% of device size, while a 140GiB device would allocate too little
byte_count = min(byte_count, max(0, int(device_memory - 1.2 * 1024**3))) byte_count = min(byte_count, max(0, int(device_memory - 1.2 * 1024**3)))
# If there is *unused* reserved cuda memory, we can skip/reduce the allocation. # If there is *unused* reserved cuda/xpu memory, we can skip/reduce the allocation.
unused_memory = torch.cuda.memory_reserved(index) - torch.cuda.memory_allocated(index) unused_memory = torch_accelerator_module.memory_reserved(
index
) - torch_accelerator_module.memory_allocated(index)
byte_count = max(0, byte_count - unused_memory) byte_count = max(0, byte_count - unused_memory)
# Allocate memory # Allocate memory
_ = torch.empty(byte_count // factor, dtype=torch.float16, device=device, requires_grad=False) _ = torch.empty(byte_count // factor, dtype=torch.float16, device=device, requires_grad=False)