Restrict caching allocator warmup to non-quantized models (#36428)

2025-02-26 22:16:15 +01:00
parent a7fbab33ae
commit 8ede897c30
1 changed file with 2 additions and 1 deletion
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -4839,7 +4839,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
        model.expected_keys = expected_keys
        if device_map is not None:
            expanded_device_map = expand_device_map(device_map, original_loaded_keys, start_prefix)
-            caching_allocator_warmup(model, expanded_device_map, dtype)
+            if hf_quantizer is None:
+                caching_allocator_warmup(model_to_load, expanded_device_map, dtype)

        if device_map is not None and is_safetensors:
            param_device_map = expanded_device_map