add modules_in_block_to_quantize arg in GPTQconfig (#27956)
* add inside_layer_modules arg * fix * change to modules_to_quantize_inside_block * fix * rename again * Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * better docstring * fix again with less explanation * Update src/transformers/utils/quantization_config.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * style --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
@@ -363,7 +363,7 @@ class GPTQConfig(QuantizationConfigMixin):
|
|||||||
model_seqlen (`int`, *optional*):
|
model_seqlen (`int`, *optional*):
|
||||||
The maximum sequence length that the model can take.
|
The maximum sequence length that the model can take.
|
||||||
block_name_to_quantize (`str`, *optional*):
|
block_name_to_quantize (`str`, *optional*):
|
||||||
The transformers block name to quantize.
|
The transformers block name to quantize. If `None`, we will infer the block name using common patterns (e.g. `model.layers`).
|
||||||
module_name_preceding_first_block (`List[str]`, *optional*):
|
module_name_preceding_first_block (`List[str]`, *optional*):
|
||||||
The layers that are preceding the first Transformer block.
|
The layers that are preceding the first Transformer block.
|
||||||
batch_size (`int`, *optional*, defaults to 1):
|
batch_size (`int`, *optional*, defaults to 1):
|
||||||
@@ -380,6 +380,13 @@ class GPTQConfig(QuantizationConfigMixin):
|
|||||||
to `{"version": 1}` if unset.
|
to `{"version": 1}` if unset.
|
||||||
cache_block_outputs (`bool`, *optional*, defaults to `True`):
|
cache_block_outputs (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to cache block outputs to reuse as inputs for the succeeding block.
|
Whether to cache block outputs to reuse as inputs for the succeeding block.
|
||||||
|
modules_in_block_to_quantize (`List[List[str]]`, *optional*):
|
||||||
|
List of lists of module names to quantize in the specified block. This argument is useful to exclude certain linear modules from being quantized.
|
||||||
|
The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially. If not set, we will quantize all linear layers.
|
||||||
|
Example: `modules_in_block_to_quantize = [["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"]]`.
|
||||||
|
In this example, we will first quantize the q,k,v layers simultaneously since they are independent.
|
||||||
|
Then, we will quantize the `self_attn.o_proj` layer with the q, k, v layers quantized. This way, we will get
|
||||||
|
better results since it reflects the real input `self_attn.o_proj` will get when the model is quantized.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -402,6 +409,7 @@ class GPTQConfig(QuantizationConfigMixin):
|
|||||||
max_input_length: Optional[int] = None,
|
max_input_length: Optional[int] = None,
|
||||||
exllama_config: Optional[Dict[str, Any]] = None,
|
exllama_config: Optional[Dict[str, Any]] = None,
|
||||||
cache_block_outputs: bool = True,
|
cache_block_outputs: bool = True,
|
||||||
|
modules_in_block_to_quantize: Optional[List[List[str]]] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
self.quant_method = QuantizationMethod.GPTQ
|
self.quant_method = QuantizationMethod.GPTQ
|
||||||
@@ -424,6 +432,7 @@ class GPTQConfig(QuantizationConfigMixin):
|
|||||||
self.exllama_config = exllama_config
|
self.exllama_config = exllama_config
|
||||||
self.disable_exllama = kwargs.pop("disable_exllama", None)
|
self.disable_exllama = kwargs.pop("disable_exllama", None)
|
||||||
self.cache_block_outputs = cache_block_outputs
|
self.cache_block_outputs = cache_block_outputs
|
||||||
|
self.modules_in_block_to_quantize = modules_in_block_to_quantize
|
||||||
self.post_init()
|
self.post_init()
|
||||||
|
|
||||||
def get_loading_attributes(self):
|
def get_loading_attributes(self):
|
||||||
@@ -494,6 +503,12 @@ class GPTQConfig(QuantizationConfigMixin):
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"You need optimum > 1.13.2 and auto-gptq > 0.4.2 . Make sure to have that version installed - detected version : optimum {optimum_version} and autogptq {autogptq_version}"
|
f"You need optimum > 1.13.2 and auto-gptq > 0.4.2 . Make sure to have that version installed - detected version : optimum {optimum_version} and autogptq {autogptq_version}"
|
||||||
)
|
)
|
||||||
|
if self.modules_in_block_to_quantize is not None:
|
||||||
|
optimum_version = version.parse(importlib.metadata.version("optimum"))
|
||||||
|
if optimum_version < version.parse("1.15.0"):
|
||||||
|
raise ValueError(
|
||||||
|
"You current version of `optimum` does not support `modules_in_block_to_quantize` quantization argument, please upgrade `optimum` package to a version superior than 1.15.0 ."
|
||||||
|
)
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
config_dict = super().to_dict()
|
config_dict = super().to_dict()
|
||||||
|
|||||||
Reference in New Issue
Block a user