Disable inductor config setter by default (#36608)
* Disable the inductor config setter by default. It is hard to debug and should be off by default.
* Remove the default settings in autoquant too.
* Add info to torchao.md about the recommended settings.
* Satisfy Ruff format.

Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
This commit is contained in:
@@ -150,6 +150,9 @@ output = bf16_model.generate(**input_ids, max_new_tokens=10, cache_implementatio
|
|||||||
print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))
|
print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))
|
||||||
```
|
```
|
||||||
|
|
||||||
|
> [!TIP]
|
||||||
|
> For best performance, apply the recommended Inductor settings by calling `torchao.quantization.utils.recommended_inductor_config_setter()`. These settings are not applied by default.
|
||||||
|
|
||||||
</hfoption>
|
</hfoption>
|
||||||
<hfoption id="automatic">
|
<hfoption id="automatic">
|
||||||
|
|
||||||
|
|||||||
@@ -236,7 +236,7 @@ class TorchAoHfQuantizer(HfQuantizer):
|
|||||||
else:
|
else:
|
||||||
assert isinstance(self.quantization_config, TorchAoConfig)
|
assert isinstance(self.quantization_config, TorchAoConfig)
|
||||||
module._parameters[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device)
|
module._parameters[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device)
|
||||||
quantize_(module, self.quantization_config.get_quantize_config())
|
quantize_(module, self.quantization_config.get_apply_tensor_subclass(), set_inductor_config=False)
|
||||||
|
|
||||||
def _process_model_after_weight_loading(self, model, **kwargs):
|
def _process_model_after_weight_loading(self, model, **kwargs):
|
||||||
"""No process required for torchao quantized model"""
|
"""No process required for torchao quantized model"""
|
||||||
@@ -246,7 +246,10 @@ class TorchAoHfQuantizer(HfQuantizer):
|
|||||||
|
|
||||||
model = torch.compile(model, mode="max-autotune")
|
model = torch.compile(model, mode="max-autotune")
|
||||||
model = autoquant(
|
model = autoquant(
|
||||||
model, qtensor_class_list=ALL_AUTOQUANT_CLASS_LIST, **self.quantization_config.quant_type_kwargs
|
model,
|
||||||
|
qtensor_class_list=ALL_AUTOQUANT_CLASS_LIST,
|
||||||
|
set_inductor_config=False,
|
||||||
|
**self.quantization_config.quant_type_kwargs,
|
||||||
)
|
)
|
||||||
return model
|
return model
|
||||||
return
|
return
|
||||||
|
|||||||
Reference in New Issue
Block a user