[docs] Increase visibility of torch_dtype="auto" (#35067)
* auto-dtype * feedback
This commit is contained in:
@@ -405,7 +405,7 @@ To load a model in 4-bit for inference, use the `load_in_4bit` parameter. The `d
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
model_name = "bigscience/bloom-2b5"
|
||||
model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True)
|
||||
model_4bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True)
|
||||
```
|
||||
|
||||
To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 600MB of memory to the first GPU and 1GB of memory to the second GPU:
|
||||
@@ -414,7 +414,7 @@ To load a model in 4-bit for inference with multiple GPUs, you can control how m
|
||||
max_memory_mapping = {0: "600MB", 1: "1GB"}
|
||||
model_name = "bigscience/bloom-3b"
|
||||
model_4bit = AutoModelForCausalLM.from_pretrained(
|
||||
model_name, device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping
|
||||
model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping
|
||||
)
|
||||
```
|
||||
|
||||
@@ -432,7 +432,7 @@ To load a model in 8-bit for inference, use the `load_in_8bit` parameter. The `d
|
||||
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
|
||||
|
||||
model_name = "bigscience/bloom-2b5"
|
||||
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
|
||||
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True))
|
||||
```
|
||||
|
||||
If you're loading a model in 8-bit for text generation, you should use the [`~transformers.GenerationMixin.generate`] method instead of the [`Pipeline`] function which is not optimized for 8-bit models and will be slower. Some sampling strategies, like nucleus sampling, are also not supported by the [`Pipeline`] for 8-bit models. You should also place all inputs on the same device as the model:
|
||||
@@ -442,7 +442,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
||||
|
||||
model_name = "bigscience/bloom-2b5"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
|
||||
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True))
|
||||
|
||||
prompt = "Hello, my llama is cute"
|
||||
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
|
||||
@@ -456,7 +456,7 @@ To load a model in 4-bit for inference with multiple GPUs, you can control how m
|
||||
max_memory_mapping = {0: "1GB", 1: "2GB"}
|
||||
model_name = "bigscience/bloom-3b"
|
||||
model_8bit = AutoModelForCausalLM.from_pretrained(
|
||||
model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping
|
||||
model_name, torch_dtype="auto", device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping
|
||||
)
|
||||
```
|
||||
|
||||
@@ -515,7 +515,7 @@ quantization_config = BitsAndBytesConfig(
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config)
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype="auto", quantization_config=quantization_config)
|
||||
|
||||
# enable BetterTransformer
|
||||
model = model.to_bettertransformer()
|
||||
|
||||
Reference in New Issue
Block a user