Exllama kernels support for AWQ models (#28634)

* added exllama kernels support for awq models
* doc
* style
* Update src/transformers/modeling_utils.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
* refactor
* moved exllama post init to after device dispatching
* bump autoawq version
* added exllama test
* style
* configurable exllama kernels
* copy exllama_config from gptq
* moved exllama version check to post init
* moved to quantization dockerfile
---------
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2024-03-05 03:22:48 +01:00
parent 81c8191b46
commit 4fc708f98c
6 changed files with 127 additions and 13 deletions
--- a/tests/quantization/autoawq/test_awq.py
+++ b/tests/quantization/autoawq/test_awq.py
@@ -192,6 +192,20 @@ class AwqTest(unittest.TestCase):
        output = quantized_model.generate(**input_ids, max_new_tokens=40)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_BF16)

+    def test_quantized_model_exllama(self):
+        """
+        Checks generation with the exllama backend; uses `device_map` since exllama post init runs after dispatch
+        """
+        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
+
+        quantization_config = AwqConfig(version="exllama")
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name, quantization_config=quantization_config, device_map=torch_device
+        )
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=40)
+        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
+
    def test_quantized_model_no_device_map(self):
        """
        Simple test that checks if the quantized model is working properly