From 2a002d073a337051bdc3fbdc95ff1bc0399ae2bb Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 6 Mar 2024 06:30:47 +0100 Subject: [PATCH] [`Docs` / `Awq`] Add docs on exllamav2 + AWQ (#29474) * add docs on exllamav2 + AWQ * Update docs/source/en/quantization.md --- docs/source/en/quantization.md | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/docs/source/en/quantization.md b/docs/source/en/quantization.md index b8c09d4497..ef5a544bc4 100644 --- a/docs/source/en/quantization.md +++ b/docs/source/en/quantization.md @@ -196,6 +196,45 @@ The parameter `modules_to_fuse` should include: +### Exllama-v2 support + +Recent versions of `autoawq` support exllama-v2 kernels for faster prefill and decoding. To get started, first install the latest version of `autoawq` by running: + +```bash +pip install git+https://github.com/casper-hansen/AutoAWQ.git +``` + +Get started by passing an `AwqConfig()` with `version="exllama"`. + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig + +quantization_config = AwqConfig(version="exllama") + +model = AutoModelForCausalLM.from_pretrained( + "TheBloke/Mistral-7B-Instruct-v0.1-AWQ", + quantization_config=quantization_config, + device_map="auto", +) + +input_ids = torch.randint(0, 100, (1, 128), dtype=torch.long, device="cuda") +output = model(input_ids) +print(output.logits) + +tokenizer = AutoTokenizer.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-AWQ") +input_ids = tokenizer.encode("How to make a cake", return_tensors="pt").to(model.device) +output = model.generate(input_ids, do_sample=True, max_length=50, pad_token_id=50256) +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + + + +Note that this feature is supported on AMD GPUs. + + + + ## AutoGPTQ