|
|
|
|
@@ -49,6 +49,7 @@ Check the table below to see if your hardware is compatible.
|
|
|
|
|
| Component | Compatibility |
|
|
|
|
|
|----------|----------------|
|
|
|
|
|
| CUDA Versions | ✅ cu118, cu126, cu128 |
|
|
|
|
|
| XPU Versions | ✅ pytorch2.8 |
|
|
|
|
|
| CPU | ✅ change `device_map="cpu"` (see examples below) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -278,6 +279,71 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
|
|
|
|
</hfoption>
|
|
|
|
|
</hfoptions>
|
|
|
|
|
|
|
|
|
|
### Intel XPU
|
|
|
|
|
<hfoptions id="examples-Intel-XPU">
|
|
|
|
|
<hfoption id="int8-dynamic-and-weight-only">
|
|
|
|
|
|
|
|
|
|
```py
|
|
|
|
|
import torch
|
|
|
|
|
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
|
|
|
|
|
from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig
|
|
|
|
|
|
|
|
|
|
quant_config = Int8DynamicActivationInt8WeightConfig()
|
|
|
|
|
# or int8 weight only quantization
|
|
|
|
|
# quant_config = Int8WeightOnlyConfig()
|
|
|
|
|
quantization_config = TorchAoConfig(quant_type=quant_config)
|
|
|
|
|
|
|
|
|
|
# Load and quantize the model
|
|
|
|
|
quantized_model = AutoModelForCausalLM.from_pretrained(
|
|
|
|
|
"meta-llama/Llama-3.1-8B-Instruct",
|
|
|
|
|
torch_dtype="auto",
|
|
|
|
|
device_map="auto",
|
|
|
|
|
quantization_config=quantization_config
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
|
|
|
|
input_text = "What are we having for dinner?"
|
|
|
|
|
input_ids = tokenizer(input_text, return_tensors="pt").to("xpu")
|
|
|
|
|
|
|
|
|
|
# auto-compile the quantized model with `cache_implementation="static"` for a speedup
|
|
|
|
|
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
|
|
|
|
|
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
|
|
|
|
```
|
|
|
|
|
</hfoption>
|
|
|
|
|
|
|
|
|
|
<hfoption id="int4-weight-only">
|
|
|
|
|
|
|
|
|
|
```py
|
|
|
|
|
import torch
|
|
|
|
|
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
|
|
|
|
|
from torchao.quantization import Int4WeightOnlyConfig
|
|
|
|
|
from torchao.dtypes import Int4XPULayout
|
|
|
|
|
from torchao.quantization.quant_primitives import ZeroPointDomain
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT)
|
|
|
|
|
quantization_config = TorchAoConfig(quant_type=quant_config)
|
|
|
|
|
|
|
|
|
|
# Load and quantize the model
|
|
|
|
|
quantized_model = AutoModelForCausalLM.from_pretrained(
|
|
|
|
|
"meta-llama/Llama-3.1-8B-Instruct",
|
|
|
|
|
torch_dtype="auto",
|
|
|
|
|
device_map="auto",
|
|
|
|
|
quantization_config=quantization_config
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
|
|
|
|
input_text = "What are we having for dinner?"
|
|
|
|
|
input_ids = tokenizer(input_text, return_tensors="pt").to("xpu")
|
|
|
|
|
|
|
|
|
|
# auto-compile the quantized model with `cache_implementation="static"` for a speedup
|
|
|
|
|
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
|
|
|
|
|
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
|
|
|
|
```
|
|
|
|
|
</hfoption>
|
|
|
|
|
</hfoptions>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### CPU
|
|
|
|
|
<hfoptions id="examples-CPU">
|
|
|
|
|
<hfoption id="int8-dynamic-and-weight-only">
|
|
|
|
|
@@ -363,7 +429,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
|
|
|
|
|
|
|
|
|
|
# Manual Testing
|
|
|
|
|
prompt = "Hey, are you conscious? Can you talk to me?"
|
|
|
|
|
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
|
|
|
|
|
inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device.type)
|
|
|
|
|
generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
|
|
|
|
|
output_text = tokenizer.batch_decode(
|
|
|
|
|
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
|
|
|
|
@@ -434,7 +500,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
|
|
|
|
|
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
|
|
|
|
input_text = "What are we having for dinner?"
|
|
|
|
|
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
|
|
|
|
|
input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type)
|
|
|
|
|
|
|
|
|
|
# auto-compile the quantized model with `cache_implementation="static"` for a speedup
|
|
|
|
|
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
|
|
|
|
|
@@ -474,7 +540,7 @@ tokenizer.push_to_hub(f"{USER_ID}/llama3-8b-int4wo-128")
|
|
|
|
|
|
|
|
|
|
## Loading quantized models
|
|
|
|
|
|
|
|
|
|
Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA.
|
|
|
|
|
Loading a quantized model depends on the quantization scheme. For quantization schemes like int8 and float8, you can quantize the model on any device and load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA or XPU.
|
|
|
|
|
```py
|
|
|
|
|
import torch
|
|
|
|
|
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
|
|
|
|
|
@@ -491,7 +557,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
|
|
|
|
|
quantization_config=quantization_config
|
|
|
|
|
)
|
|
|
|
|
# save the quantized model
|
|
|
|
|
output_dir = "llama-3.1-8b-torchao-int8-cuda"
|
|
|
|
|
output_dir = "llama-3.1-8b-torchao-int8"
|
|
|
|
|
quantized_model.save_pretrained(output_dir, safe_serialization=False)
|
|
|
|
|
|
|
|
|
|
# reload the quantized model
|
|
|
|
|
@@ -502,7 +568,7 @@ reloaded_model = AutoModelForCausalLM.from_pretrained(
|
|
|
|
|
)
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
|
|
|
|
input_text = "What are we having for dinner?"
|
|
|
|
|
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
|
|
|
|
|
input_ids = tokenizer(input_text, return_tensors="pt").to(reloaded_model.device.type)
|
|
|
|
|
|
|
|
|
|
output = reloaded_model.generate(**input_ids, max_new_tokens=10)
|
|
|
|
|
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
|
|
|
|
|