Disable inductor config setter by default (#36608)

* Disable inductor config setter by default

Automatically setting the inductor config is hard to debug, so it should be off by default.

* Remove the default inductor settings in autoquant too

* Add info to torchao.md about recommended settings

* Satisfy Ruff format

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
This commit is contained in:
HDCharles
2025-03-20 06:23:14 -04:00
committed by GitHub
parent 8733297b41
commit 94555437e2
2 changed files with 8 additions and 2 deletions

View File

@@ -150,6 +150,9 @@ output = bf16_model.generate(**input_ids, max_new_tokens=10, cache_implementatio
print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static")) print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))
``` ```
> [!TIP]
> For best performance, apply the recommended inductor settings by calling `torchao.quantization.utils.recommended_inductor_config_setter()`.
</hfoption> </hfoption>
<hfoption id="automatic"> <hfoption id="automatic">

View File

@@ -236,7 +236,7 @@ class TorchAoHfQuantizer(HfQuantizer):
else: else:
assert isinstance(self.quantization_config, TorchAoConfig) assert isinstance(self.quantization_config, TorchAoConfig)
module._parameters[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device) module._parameters[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device)
quantize_(module, self.quantization_config.get_quantize_config()) quantize_(module, self.quantization_config.get_apply_tensor_subclass(), set_inductor_config=False)
def _process_model_after_weight_loading(self, model, **kwargs): def _process_model_after_weight_loading(self, model, **kwargs):
"""No process required for torchao quantized model""" """No process required for torchao quantized model"""
@@ -246,7 +246,10 @@ class TorchAoHfQuantizer(HfQuantizer):
model = torch.compile(model, mode="max-autotune") model = torch.compile(model, mode="max-autotune")
model = autoquant( model = autoquant(
model, qtensor_class_list=ALL_AUTOQUANT_CLASS_LIST, **self.quantization_config.quant_type_kwargs model,
qtensor_class_list=ALL_AUTOQUANT_CLASS_LIST,
set_inductor_config=False,
**self.quantization_config.quant_type_kwargs,
) )
return model return model
return return