From 11afab19c0e4b652855f9ed7f82aa010c4f14754 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Wed, 12 Feb 2025 02:35:28 +0800 Subject: [PATCH] [docs] update awq doc (#36079) * update awq doc * Update docs/source/en/quantization/awq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/awq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/awq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/awq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * add note for inference --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/quantization/awq.md | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/source/en/quantization/awq.md b/docs/source/en/quantization/awq.md index ca26844edd..f581c16139 100644 --- a/docs/source/en/quantization/awq.md +++ b/docs/source/en/quantization/awq.md @@ -31,6 +31,8 @@ Make sure you have autoawq installed: ```bash pip install autoawq ``` +> [!WARNING] +> AutoAWQ downgrades Transformers to version 4.47.1. If you want to do inference with AutoAWQ, you may need to reinstall your preferred version of Transformers after installing AutoAWQ. 
AWQ-quantized models can be identified by checking the `quantization_config` attribute in the model's [config.json](https://huggingface.co/TheBloke/zephyr-7B-alpha-AWQ/blob/main/config.json) file: @@ -59,13 +61,14 @@ A quantized model is loaded with the [`~PreTrainedModel.from_pretrained`] method from transformers import AutoModelForCausalLM, AutoTokenizer model_id = "TheBloke/zephyr-7B-alpha-AWQ" -model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0") +model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto") ``` Loading an AWQ-quantized model automatically sets other weights to fp16 by default for performance reasons. If you want to load these other weights in a different format, use the `torch_dtype` parameter: ```py from transformers import AutoModelForCausalLM, AutoTokenizer +import torch model_id = "TheBloke/zephyr-7B-alpha-AWQ" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32) @@ -175,7 +178,7 @@ quantization_config = AwqConfig( } ) -model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config).to(0) +model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, trust_remote_code=True).to(0) ``` The parameter `modules_to_fuse` should include: @@ -232,12 +235,12 @@ Note this feature is supported on AMD GPUs. -## CPU support +## Intel CPU/GPU support -Recent versions of `autoawq` supports CPU with ipex op optimizations. To get started, first install the latest version of `autoawq` by running: +Recent versions of autoawq support Intel CPU/GPU with IPEX op optimizations. To get started, install the latest version of autoawq. 
```bash -pip install intel-extension-for-pytorch +pip install intel-extension-for-pytorch # for IPEX-GPU refer to https://intel.github.io/intel-extension-for-pytorch/xpu/2.5.10+xpu/ pip install git+https://github.com/casper-hansen/AutoAWQ.git ``` @@ -247,20 +250,21 @@ Get started by passing an `AwqConfig()` with `version="ipex"`. import torch from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig +device = "cpu" # set to "xpu" for Intel GPU quantization_config = AwqConfig(version="ipex") model = AutoModelForCausalLM.from_pretrained( "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", quantization_config=quantization_config, - device_map="cpu", + device_map=device, ) -input_ids = torch.randint(0, 100, (1, 128), dtype=torch.long, device="cpu") +input_ids = torch.randint(0, 100, (1, 128), dtype=torch.long, device=device) output = model(input_ids) print(output.logits) tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ") -input_ids = tokenizer.encode("How to make a cake", return_tensors="pt") +input_ids = tokenizer.encode("How to make a cake", return_tensors="pt").to(device) pad_token_id = tokenizer.eos_token_id output = model.generate(input_ids, do_sample=True, max_length=50, pad_token_id=pad_token_id) print(tokenizer.decode(output[0], skip_special_tokens=True)) @@ -268,6 +272,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) -Note this feature is supported on Intel CPUs. +This feature is supported on Intel CPUs/GPUs. - \ No newline at end of file +