From 94555437e277949e6b3be9e97317df2a5e046d95 Mon Sep 17 00:00:00 2001 From: HDCharles <39544797+HDCharles@users.noreply.github.com> Date: Thu, 20 Mar 2025 06:23:14 -0400 Subject: [PATCH] Disable inductor config setter by default (#36608) * Disable inductor config setter by default This is hard to debug and should be off by default * remove default settings in autoquant too * Add info to torchao.md about recommended settings * satisfying Ruff format Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --------- Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- docs/source/en/quantization/torchao.md | 3 +++ src/transformers/quantizers/quantizer_torchao.py | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index 31e2d4f020..42231d759c 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -150,6 +150,9 @@ output = bf16_model.generate(**input_ids, max_new_tokens=10, cache_implementatio print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static")) ``` +> [!TIP] +> For best performance, you can use recommended settings by calling `torchao.quantization.utils.recommended_inductor_config_setter()` + diff --git a/src/transformers/quantizers/quantizer_torchao.py b/src/transformers/quantizers/quantizer_torchao.py index 1dddaf19d4..d15f09be8e 100644 --- a/src/transformers/quantizers/quantizer_torchao.py +++ b/src/transformers/quantizers/quantizer_torchao.py @@ -236,7 +236,7 @@ class TorchAoHfQuantizer(HfQuantizer): else: assert isinstance(self.quantization_config, TorchAoConfig) module._parameters[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device) - quantize_(module, self.quantization_config.get_quantize_config()) + quantize_(module, self.quantization_config.get_apply_tensor_subclass(), set_inductor_config=False) def _process_model_after_weight_loading(self, model, **kwargs): """No process required for torchao quantized model""" @@ -246,7 +246,10 @@ class TorchAoHfQuantizer(HfQuantizer): model = torch.compile(model, mode="max-autotune") model = autoquant( - model, qtensor_class_list=ALL_AUTOQUANT_CLASS_LIST, **self.quantization_config.quant_type_kwargs + model, + qtensor_class_list=ALL_AUTOQUANT_CLASS_LIST, + set_inductor_config=False, + **self.quantization_config.quant_type_kwargs, ) return model return