@@ -39,10 +39,10 @@ pip install --upgrade accelerate torch
|
|||||||
By default, the weights are loaded in full precision (torch.float32) regardless of the data type they are actually stored in, such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in the model's `config.json` file, which automatically selects the most memory-optimal data type.
|
By default, the weights are loaded in full precision (torch.float32) regardless of the data type they are actually stored in, such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in the model's `config.json` file, which automatically selects the most memory-optimal data type.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
from transformers import FP8Config, AutoModelForCausalLM, AutoTokenizer
|
from transformers import FineGrainedFP8Config, AutoModelForCausalLM, AutoTokenizer
|
||||||
|
|
||||||
model_name = "meta-llama/Meta-Llama-3-8B"
|
model_name = "meta-llama/Meta-Llama-3-8B"
|
||||||
quantization_config = FP8Config()
|
quantization_config = FineGrainedFP8Config()
|
||||||
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
|
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||||
|
|||||||
Reference in New Issue
Block a user