Enable xpu allocator on caching_allocator_warmup (#39654)
* add xpu allocator

  Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix typo

  Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix variable name

  Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* rm useless default value

  Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
This commit is contained in:
@@ -6021,19 +6021,22 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict,
|
|||||||
|
|
||||||
# This will kick off the caching allocator to avoid having to Malloc afterwards
|
# This will kick off the caching allocator to avoid having to Malloc afterwards
|
||||||
for device, byte_count in total_byte_count.items():
|
for device, byte_count in total_byte_count.items():
|
||||||
if device.type == "cuda":
|
if device.type in ["cuda", "xpu"]:
|
||||||
index = device.index if device.index is not None else torch.cuda.current_device()
|
torch_accelerator_module = getattr(torch, device.type)
|
||||||
device_memory = torch.cuda.mem_get_info(index)[0]
|
index = device.index if device.index is not None else torch_accelerator_module.current_device()
|
||||||
|
device_memory = torch_accelerator_module.mem_get_info(index)[0]
|
||||||
# Allow up to (max device memory - 1.2 GiB) in resource-constrained hardware configurations. Trying to reserve more
|
# Allow up to (max device memory - 1.2 GiB) in resource-constrained hardware configurations. Trying to reserve more
|
||||||
# than that amount might sometimes lead to unnecessary cuda OOM, if the last parameter to be loaded on the device is large,
|
# than that amount might sometimes lead to unnecessary cuda/xpu OOM, if the last parameter to be loaded on the device is large,
|
||||||
# and the remaining reserved memory portion is smaller than the param size -> torch will then try to fully re-allocate all
|
# and the remaining reserved memory portion is smaller than the param size -> torch will then try to fully re-allocate all
|
||||||
# the param size, instead of using the remaining reserved part, and allocating only the difference, which can lead
|
# the param size, instead of using the remaining reserved part, and allocating only the difference, which can lead
|
||||||
# to OOM. See https://github.com/huggingface/transformers/issues/37436#issuecomment-2808982161 for more details.
|
# to OOM. See https://github.com/huggingface/transformers/issues/37436#issuecomment-2808982161 for more details.
|
||||||
# Note that we use an absolute value instead of device proportion here, as a 8GiB device could still allocate too much
|
# Note that we use an absolute value instead of device proportion here, as a 8GiB device could still allocate too much
|
||||||
# if using e.g. 90% of device size, while a 140GiB device would allocate too little
|
# if using e.g. 90% of device size, while a 140GiB device would allocate too little
|
||||||
byte_count = min(byte_count, max(0, int(device_memory - 1.2 * 1024**3)))
|
byte_count = min(byte_count, max(0, int(device_memory - 1.2 * 1024**3)))
|
||||||
# If there is *unused* reserved cuda memory, we can skip/reduce the allocation.
|
# If there is *unused* reserved cuda/xpu memory, we can skip/reduce the allocation.
|
||||||
unused_memory = torch.cuda.memory_reserved(index) - torch.cuda.memory_allocated(index)
|
unused_memory = torch_accelerator_module.memory_reserved(
|
||||||
|
index
|
||||||
|
) - torch_accelerator_module.memory_allocated(index)
|
||||||
byte_count = max(0, byte_count - unused_memory)
|
byte_count = max(0, byte_count - unused_memory)
|
||||||
# Allocate memory
|
# Allocate memory
|
||||||
_ = torch.empty(byte_count // factor, dtype=torch.float16, device=device, requires_grad=False)
|
_ = torch.empty(byte_count // factor, dtype=torch.float16, device=device, requires_grad=False)
|
||||||
|
|||||||
Reference in New Issue
Block a user