From 48e179857ce4574efec0210a7405eeb6f93b3974 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Mon, 7 Apr 2025 18:33:48 +0200 Subject: [PATCH] Remove HQQ from caching allocator warmup (#37347) Update modeling_utils.py --- src/transformers/modeling_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 470bbe4ad9..5d2638b34e 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -4612,6 +4612,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix ): # Useful flags is_quantized = hf_quantizer is not None + is_hqq = is_quantized and hf_quantizer.quantization_config.quant_method == QuantizationMethod.HQQ is_hqq_or_bnb = is_quantized and hf_quantizer.quantization_config.quant_method in [ QuantizationMethod.HQQ, QuantizationMethod.BITS_AND_BYTES, @@ -4777,7 +4778,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix expected_keys = hf_quantizer.update_expected_keys(model_to_load, expected_keys, checkpoint_keys) # Warmup cuda to load the weights much faster on devices - if device_map is not None: + if device_map is not None and not is_hqq: expanded_device_map = expand_device_map(device_map, expected_keys) caching_allocator_warmup(model_to_load, expanded_device_map, factor=2 if hf_quantizer is None else 4)