From b916efcb3c615132d6278ccc46c48cf7e5ea7792 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 4 Oct 2024 22:25:10 +0800 Subject: [PATCH] Enables CPU AWQ model with IPEX version. (#33460) * enable cpu awq ipex linear * add doc for cpu awq with ipex kernel * add tests for cpu awq * fix code style * fix doc and tests * Update docs/source/en/quantization/awq.md Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> * Update tests/quantization/autoawq/test_awq.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> * fix comments * fix log * fix log * fix style --------- Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- docs/source/en/quantization/awq.md | 41 ++++++++++++++++++ src/transformers/integrations/__init__.py | 2 + src/transformers/integrations/awq.py | 29 +++++++++++-- src/transformers/quantizers/quantizer_awq.py | 42 +++++++++++++------ src/transformers/utils/quantization_config.py | 15 ++++--- tests/quantization/autoawq/test_awq.py | 29 +++++++++++++ 6 files changed, 138 insertions(+), 20 deletions(-) diff --git a/docs/source/en/quantization/awq.md b/docs/source/en/quantization/awq.md index 3c94bcca15..ca26844edd 100644 --- a/docs/source/en/quantization/awq.md +++ b/docs/source/en/quantization/awq.md @@ -230,3 +230,44 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) Note this feature is supported on AMD GPUs. + + +## CPU support + +Recent versions of `autoawq` supports CPU with ipex op optimizations. To get started, first install the latest version of `autoawq` by running: + +```bash +pip install intel-extension-for-pytorch +pip install git+https://github.com/casper-hansen/AutoAWQ.git +``` + +Get started by passing an `AwqConfig()` with `version="ipex"`. + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig + +quantization_config = AwqConfig(version="ipex") + +model = AutoModelForCausalLM.from_pretrained( + "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + quantization_config=quantization_config, + device_map="cpu", +) + +input_ids = torch.randint(0, 100, (1, 128), dtype=torch.long, device="cpu") +output = model(input_ids) +print(output.logits) + +tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ") +input_ids = tokenizer.encode("How to make a cake", return_tensors="pt") +pad_token_id = tokenizer.eos_token_id +output = model.generate(input_ids, do_sample=True, max_length=50, pad_token_id=pad_token_id) +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + + + +Note this feature is supported on Intel CPUs. + + \ No newline at end of file diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py index 00bbcf2d06..b172c78513 100755 --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -21,6 +21,7 @@ _import_structure = { "awq": [ "fuse_awq_modules", "post_init_awq_exllama_modules", + "post_init_awq_ipex_modules", "replace_quantization_scales", "replace_with_awq_linear", ], @@ -115,6 +116,7 @@ if TYPE_CHECKING: from .awq import ( fuse_awq_modules, post_init_awq_exllama_modules, + post_init_awq_ipex_modules, replace_quantization_scales, replace_with_awq_linear, ) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index 18e1931d07..a945b7f781 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -145,6 +145,10 @@ def replace_with_awq_linear( target_cls = WQLinear_ExllamaV2 else: raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}") + elif quantization_config.version == AWQLinearVersion.IPEX: + from awq.modules.linear.gemm_ipex import WQLinear_IPEX + + target_cls = WQLinear_IPEX else: raise ValueError(f"Unrecognized AWQ version: {quantization_config.version}") else: @@ -266,8 +270,11 @@ def fuse_awq_modules(model, quantization_config): # Replace layer norms _fuse_awq_layernorm(modules_to_fuse["layernorm"], module, FasterTransformerRMSNorm) - # Replace MLP layers - _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP) + # Replace MLP layers if awq version is not ipex. + if quantization_config.version != "ipex": + _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP) + else: + logger.info("The IPEX version AWQ does not support fuse mlp for now.") # Replace attention layers attention_has_been_fused = _fuse_awq_attention_layers( @@ -372,7 +379,7 @@ def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_na The `QuantAttentionFused` class as it only supports that class for now. """ - from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV + from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV, WQLinear_IPEX module_has_been_fused = False @@ -389,6 +396,9 @@ def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_na elif isinstance(q_proj, WQLinear_GEMM): linear_target_cls = WQLinear_GEMM cat_dim = 1 + elif isinstance(q_proj, WQLinear_IPEX): + linear_target_cls = WQLinear_IPEX + cat_dim = 1 else: raise ValueError("Unsupported q_proj type: {type(q_proj)}") @@ -466,3 +476,16 @@ def post_init_awq_exllama_modules(model, exllama_config): raise ValueError(f"Unrecognized Exllama version: {exllama_config['version']}") return model + + +def post_init_awq_ipex_modules(model): + """ + Runs post init for IPEX layers which performs: + - Weights packing, reordering and repacking + """ + + from awq.modules.linear.gemm_ipex import ipex_post_init + + model = ipex_post_init(model) + + return model diff --git a/src/transformers/quantizers/quantizer_awq.py b/src/transformers/quantizers/quantizer_awq.py index 9c66ba385a..3df4fd5147 100644 --- a/src/transformers/quantizers/quantizer_awq.py +++ b/src/transformers/quantizers/quantizer_awq.py @@ -46,26 +46,39 @@ class AwqQuantizer(HfQuantizer): super().__init__(quantization_config, **kwargs) def validate_environment(self, device_map, **kwargs): - if not torch.cuda.is_available(): - raise RuntimeError("GPU is required to run AWQ quantized model.") - if not is_auto_awq_available(): raise ImportError("Loading an AWQ quantized model requires auto-awq library (`pip install autoawq`)") if not is_accelerate_available(): raise ImportError("Loading an AWQ quantized model requires accelerate (`pip install accelerate`)") - if device_map is None: - logger.warning_once( - "You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set " - "your model on a GPU device in order to run your model." - ) - elif device_map is not None: - if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()): + if self.quantization_config.version == AWQLinearVersion.IPEX: + if ( + device_map is not None + and isinstance(device_map, dict) + and (torch.device("cpu") not in device_map.values() or len(device_map.values()) > 1) + ): raise ValueError( - "You are attempting to load an AWQ model with a device_map that contains a CPU or disk device." - " This is not supported. Please remove the CPU or disk device from the device_map." + "You are attempting to load an IPEX version AWQ model with a device_map that contains more than CPU." + " This is not supported. Please make sure only cpu in the device_map." ) + else: + if not torch.cuda.is_available(): + raise RuntimeError( + "GPU is required to run AWQ quantized model. You can use IPEX version AWQ if you have an Intel CPU" + ) + + if device_map is None: + logger.warning_once( + "You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set " + "your model on a GPU device in order to run your model." + ) + elif device_map is not None: + if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()): + raise ValueError( + "You are attempting to load an AWQ model with a device_map that contains a CPU or disk device." + " This is not supported. Please remove the CPU or disk device from the device_map." + ) def update_torch_dtype(self, torch_dtype): if torch_dtype is None: @@ -106,6 +119,11 @@ class AwqQuantizer(HfQuantizer): model = post_init_awq_exllama_modules(model, self.quantization_config.exllama_config) + if self.quantization_config.version == AWQLinearVersion.IPEX: + from ..integrations import post_init_awq_ipex_modules + + model = post_init_awq_ipex_modules(model) + def is_serializable(self, safe_serialization=None): # AWQ through auto-awq has been always serializable, except if the model is fused. if self.quantization_config.do_fuse: diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 0223144ac9..aa15e096f5 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -51,6 +51,7 @@ class AWQLinearVersion(str, Enum): GEMM = "gemm" GEMV = "gemv" EXLLAMA = "exllama" + IPEX = "ipex" @staticmethod def from_str(version: str): @@ -61,6 +62,8 @@ class AWQLinearVersion(str, Enum): return AWQLinearVersion.GEMV elif version == "exllama": return AWQLinearVersion.EXLLAMA + elif version == "ipex": + return AWQLinearVersion.IPEX else: raise ValueError(f"Unknown AWQLinearVersion {version}") @@ -830,18 +833,20 @@ class AwqConfig(QuantizationConfigMixin): r""" Safety checker that arguments are correct """ - if not torch.cuda.is_available(): - raise ValueError("AWQ is only available on GPU") - if self.backend not in [AwqBackendPackingMethod.AUTOAWQ, AwqBackendPackingMethod.LLMAWQ]: raise ValueError( f"Only supported quantization backends in {AwqBackendPackingMethod.AUTOAWQ} and {AwqBackendPackingMethod.LLMAWQ} - not recognized backend {self.backend}" ) self.version = AWQLinearVersion.from_str(self.version) - if self.version not in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.EXLLAMA]: + if self.version not in [ + AWQLinearVersion.GEMM, + AWQLinearVersion.GEMV, + AWQLinearVersion.EXLLAMA, + AWQLinearVersion.IPEX, + ]: raise ValueError( - f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.EXLLAMA] - not recognized version {self.version}" + f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.EXLLAMA, AWQLinearVersion.IPEX] - not recognized version {self.version}" ) if self.backend == AwqBackendPackingMethod.LLMAWQ: diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index 58678ade57..780efe8aa4 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -21,6 +21,7 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AwqCon from transformers.testing_utils import ( require_accelerate, require_auto_awq, + require_intel_extension_for_pytorch, require_torch_gpu, require_torch_multi_gpu, slow, @@ -490,3 +491,31 @@ class AwqScaleTest(unittest.TestCase): "TechxGenus/starcoder2-3b-AWQ", torch_dtype=torch.float16, device_map="cuda" ) self.assertTrue(isinstance(quantized_model.model.layers[0].mlp.act, ScaledActivation)) + + +@slow +@require_auto_awq +@require_accelerate +@require_intel_extension_for_pytorch +class AwqIPEXTest(unittest.TestCase): + def test_quantized_model_ipex(self): + """ + Simple test that checks if the quantized model is working properly with ipex backend + """ + quantization_config = AwqConfig(version="ipex") + + model = AutoModelForCausalLM.from_pretrained( + "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + quantization_config=quantization_config, + device_map="cpu", + ) + tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ") + input_ids = tokenizer.encode("How to make a cake", return_tensors="pt") + pad_token_id = tokenizer.eos_token_id + output = model.generate(input_ids, do_sample=False, max_length=20, pad_token_id=pad_token_id) + print(tokenizer.decode(output[0], skip_special_tokens=True)) + + expected_output = ( + "How to make a cake with a round tin?\nHow to make a cake with a round tin?\n1. Preheat the oven to 180°" + ) + self.assertIn(tokenizer.decode(output[0], skip_special_tokens=True), expected_output)