Enable xpu allocator on caching_allocator_warmup (#39654)

* add xpu allocator Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix typo Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix variable name Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * rm useless default value Signed-off-by: jiqing-feng <jiqing.feng@intel.com> --------- Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
2025-07-29 22:06:52 +08:00
parent fb141e2c90
commit 8db4d79161
1 changed files with 9 additions and 6 deletions
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -6021,19 +6021,22 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict,
    # This will kick off the caching allocator to avoid having to Malloc afterwards
    for device, byte_count in total_byte_count.items():
-        if device.type == "cuda":
+        if device.type in ["cuda", "xpu"]:
-            index = device.index if device.index is not None else torch.cuda.current_device()
+            torch_accelerator_module = getattr(torch, device.type)
-            device_memory = torch.cuda.mem_get_info(index)[0]
+            index = device.index if device.index is not None else torch_accelerator_module.current_device()
            device_memory = torch_accelerator_module.mem_get_info(index)[0]
            # Allow up to (max device memory - 1.2 GiB) in resource-constrained hardware configurations. Trying to reserve more
-            # than that amount might sometimes lead to unnecessary cuda OOM, if the last parameter to be loaded on the device is large,
+            # than that amount might sometimes lead to unnecessary cuda/xpu OOM, if the last parameter to be loaded on the device is large,
            # and the remaining reserved memory portion is smaller than the param size -> torch will then try to fully re-allocate all
            # the param size, instead of using the remaining reserved part, and allocating only the difference, which can lead
            # to OOM. See https://github.com/huggingface/transformers/issues/37436#issuecomment-2808982161 for more details.
            # Note that we use an absolute value instead of device proportion here, as a 8GiB device could still allocate too much
            # if using e.g. 90% of device size, while a 140GiB device would allocate too little
            byte_count = min(byte_count, max(0, int(device_memory - 1.2 * 1024**3)))
-            # If there is *unused* reserved cuda memory, we can skip/reduce the allocation.
+            # If there is *unused* reserved cuda/xpu memory, we can skip/reduce the allocation.
-            unused_memory = torch.cuda.memory_reserved(index) - torch.cuda.memory_allocated(index)
+            unused_memory = torch_accelerator_module.memory_reserved(
                index
            ) - torch_accelerator_module.memory_allocated(index)
            byte_count = max(0, byte_count - unused_memory)
        # Allocate memory
        _ = torch.empty(byte_count // factor, dtype=torch.float16, device=device, requires_grad=False)