Name change AOPermod -> ModuleFqn (#38456)
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
This commit is contained in:
@@ -62,7 +62,7 @@ Install torchao from PyPi or the PyTorch index with the following commands.
|
||||
# Stable release from Pypi which will default to CUDA 12.6
|
||||
pip install --upgrade torchao transformers
|
||||
```
|
||||
</hfoption>
|
||||
</hfoption>
|
||||
<hfoption id="PyTorch Index">
|
||||
Stable Release from the PyTorch index
|
||||
```bash
|
||||
@@ -276,18 +276,18 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
|
||||
### Per Module Quantization
|
||||
#### 1. Skip quantization for certain layers
|
||||
With `AOPerModuleConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.
|
||||
With `ModuleFqnToConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
|
||||
|
||||
model_id = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
|
||||
from torchao.quantization import Int4WeightOnlyConfig, AOPerModuleConfig
|
||||
from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig
|
||||
config = Int4WeightOnlyConfig(group_size=128)
|
||||
|
||||
# set default to int4 (for linears), and skip quantizing `model.layers.0.self_attn.q_proj`
|
||||
quant_config = AOPerModuleConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
|
||||
quant_config = ModuleFqnToConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
|
||||
quantization_config = TorchAoConfig(quant_type=quant_config)
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
|
||||
# lm_head is not quantized and model.layers.0.self_attn.q_proj is not quantized
|
||||
@@ -311,7 +311,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
|
||||
|
||||
model_id = "facebook/opt-125m"
|
||||
|
||||
from torchao.quantization import Int4WeightOnlyConfig, AOPerModuleConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType
|
||||
from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType
|
||||
|
||||
weight_dtype = torch.int8
|
||||
granularity = PerAxis(0)
|
||||
@@ -322,7 +322,7 @@ embedding_config = IntxWeightOnlyConfig(
|
||||
mapping_type=mapping_type,
|
||||
)
|
||||
linear_config = Int8DynamicActivationInt4WeightConfig(group_size=128)
|
||||
quant_config = AOPerModuleConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
|
||||
quant_config = ModuleFqnToConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
|
||||
# set `include_embedding` to True in order to include embedding in quantization
|
||||
# when `include_embedding` is True, we'll remove input embedding from `modules_not_to_convert` as well
|
||||
quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True)
|
||||
@@ -427,8 +427,8 @@ quantized_model.save_pretrained(output_dir, safe_serialization=False)
|
||||
|
||||
# reload the quantized model
|
||||
reloaded_model = AutoModelForCausalLM.from_pretrained(
|
||||
output_dir,
|
||||
device_map="auto",
|
||||
output_dir,
|
||||
device_map="auto",
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
||||
@@ -463,8 +463,8 @@ quantized_model.save_pretrained(output_dir, safe_serialization=False)
|
||||
|
||||
# reload the quantized model
|
||||
reloaded_model = AutoModelForCausalLM.from_pretrained(
|
||||
output_dir,
|
||||
device_map="cpu",
|
||||
output_dir,
|
||||
device_map="cpu",
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
||||
|
||||
Reference in New Issue
Block a user