enable xpu on kv-cache and hqq doc (#39246)
Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
This commit is contained in:
@@ -44,7 +44,7 @@ import torch
|
|||||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
|
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
|
||||||
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
|
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
|
||||||
|
|
||||||
model.generate(**inputs, do_sample=False, max_new_tokens=20, use_cache=False)
|
model.generate(**inputs, do_sample=False, max_new_tokens=20, use_cache=False)
|
||||||
@@ -59,7 +59,7 @@ import torch
|
|||||||
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
|
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
|
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
|
||||||
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
|
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
|
||||||
|
|
||||||
past_key_values = DynamicCache()
|
past_key_values = DynamicCache()
|
||||||
@@ -142,13 +142,14 @@ Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [
|
|||||||
For [`HQQQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `1`.
|
For [`HQQQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `1`.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
|
import torch
|
||||||
from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig
|
from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
|
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
|
||||||
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
|
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
|
||||||
|
|
||||||
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"axis-key": 1, "axis-value": 1, "backend": "hqq"})
|
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"backend": "HQQ"})
|
||||||
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
|
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
|
||||||
I like rock music because it's loud and energetic. It's a great way to express myself and rel
|
I like rock music because it's loud and energetic. It's a great way to express myself and rel
|
||||||
```
|
```
|
||||||
@@ -159,13 +160,14 @@ I like rock music because it's loud and energetic. It's a great way to express m
|
|||||||
For [`QuantoQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `0`.
|
For [`QuantoQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `0`.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
|
import torch
|
||||||
from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig
|
from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
|
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
|
||||||
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
|
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
|
||||||
|
|
||||||
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "axis-key": 0, "axis-value": 0, "backend": "quanto"})
|
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
|
||||||
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
|
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
|
||||||
I like rock music because it's loud and energetic. It's a great way to express myself and rel
|
I like rock music because it's loud and energetic. It's a great way to express myself and rel
|
||||||
```
|
```
|
||||||
@@ -207,14 +209,14 @@ import torch
|
|||||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
|
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map={"": 0})
|
||||||
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
|
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
|
||||||
|
|
||||||
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="offloaded_static")
|
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="offloaded_static")
|
||||||
tokenizer.batch_decode(out, skip_special_tokens=True)[0]
|
tokenizer.batch_decode(out, skip_special_tokens=True)[0]
|
||||||
"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
|
"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
|
||||||
```
|
```
|
||||||
Cache offloading requires a CUDA GPU.
|
Cache offloading requires a CUDA GPU or Intel XPU.
|
||||||
|
|
||||||
### Sliding window cache
|
### Sliding window cache
|
||||||
|
|
||||||
@@ -227,7 +229,7 @@ import torch
|
|||||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
||||||
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16).to("cuda:0")
|
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, device_map="auto")
|
||||||
inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device)
|
inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device)
|
||||||
|
|
||||||
out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window")
|
out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window")
|
||||||
@@ -306,15 +308,15 @@ import torch
|
|||||||
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache
|
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache
|
||||||
|
|
||||||
model_id = "meta-llama/Llama-2-7b-chat-hf"
|
model_id = "meta-llama/Llama-2-7b-chat-hf"
|
||||||
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")
|
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map={"": 0})
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||||
|
|
||||||
# Init StaticCache with big enough max-length (1024 tokens for the below example)
|
# Init StaticCache with big enough max-length (1024 tokens for the below example)
|
||||||
# You can also init a DynamicCache, if that suits you better
|
# You can also init a DynamicCache, if that suits you better
|
||||||
prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16)
|
prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device=model.device.type, dtype=torch.bfloat16)
|
||||||
|
|
||||||
INITIAL_PROMPT = "You are a helpful assistant. "
|
INITIAL_PROMPT = "You are a helpful assistant. "
|
||||||
inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda")
|
inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to(model.device.type)
|
||||||
# This is the common prompt cached, we need to run forward without grad to be able to copy
|
# This is the common prompt cached, we need to run forward without grad to be able to copy
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
prompt_cache = model(**inputs_initial_prompt, past_key_values = prompt_cache).past_key_values
|
prompt_cache = model(**inputs_initial_prompt, past_key_values = prompt_cache).past_key_values
|
||||||
@@ -322,7 +324,7 @@ with torch.no_grad():
|
|||||||
prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"]
|
prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"]
|
||||||
responses = []
|
responses = []
|
||||||
for prompt in prompts:
|
for prompt in prompts:
|
||||||
new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda")
|
new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to(model.device.type)
|
||||||
past_key_values = copy.deepcopy(prompt_cache)
|
past_key_values = copy.deepcopy(prompt_cache)
|
||||||
outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20)
|
outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20)
|
||||||
response = tokenizer.batch_decode(outputs)[0]
|
response = tokenizer.batch_decode(outputs)[0]
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.
|
|||||||
|
|
||||||
HQQ further supports fine-tuning with [PEFT](https://huggingface.co/docs/peft) and is fully compatible with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for even faster inference and training.
|
HQQ further supports fine-tuning with [PEFT](https://huggingface.co/docs/peft) and is fully compatible with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for even faster inference and training.
|
||||||
|
|
||||||
Install HQQ with the following command to get the latest version and to build its corresponding CUDA kernels.
|
Install HQQ with the following command to get the latest version and, if you are using a CUDA device, to build its corresponding CUDA kernels. HQQ also supports Intel XPU with a pure PyTorch implementation.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install hqq
|
pip install hqq
|
||||||
@@ -34,13 +34,14 @@ You can choose to either replace all the linear layers in a model with the same
|
|||||||
Quantize a model by creating a [`HqqConfig`] and specifying the `nbits` and `group_size` to replace for all the linear layers ([torch.nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)) of the model.
|
Quantize a model by creating a [`HqqConfig`] and specifying the `nbits` and `group_size` to replace for all the linear layers ([torch.nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)) of the model.
|
||||||
|
|
||||||
``` py
|
``` py
|
||||||
|
import torch
|
||||||
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig
|
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig
|
||||||
|
|
||||||
quant_config = HqqConfig(nbits=8, group_size=64)
|
quant_config = HqqConfig(nbits=8, group_size=64)
|
||||||
model = transformers.AutoModelForCausalLM.from_pretrained(
|
model = transformers.AutoModelForCausalLM.from_pretrained(
|
||||||
"meta-llama/Llama-3.1-8B",
|
"meta-llama/Llama-3.1-8B",
|
||||||
torch_dtype=torch.float16,
|
torch_dtype=torch.float16,
|
||||||
device_map="cuda",
|
device_map="auto",
|
||||||
quantization_config=quant_config
|
quantization_config=quant_config
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
@@ -67,7 +68,7 @@ quant_config = HqqConfig(dynamic_config={
|
|||||||
model = transformers.AutoModelForCausalLM.from_pretrained(
|
model = transformers.AutoModelForCausalLM.from_pretrained(
|
||||||
"meta-llama/Llama-3.1-8B",
|
"meta-llama/Llama-3.1-8B",
|
||||||
torch_dtype=torch.float16,
|
torch_dtype=torch.float16,
|
||||||
device_map="cuda",
|
device_map="auto",
|
||||||
quantization_config=quant_config
|
quantization_config=quant_config
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|||||||
Reference in New Issue
Block a user