Exllama kernels support for AWQ models (#28634)
* added exllama kernels support for awq models
* doc
* style
* Update src/transformers/modeling_utils.py
  Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
* refactor
* moved exllama post init to after device dispatching
* bump autoawq version
* added exllama test
* style
* configurable exllama kernels
* copy exllama_config from gptq
* moved exllama version check to post init
* moved to quantization dockerfile
---------
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
81c8191b46
commit
4fc708f98c
@@ -192,6 +192,20 @@ class AwqTest(unittest.TestCase):
|
||||
output = quantized_model.generate(**input_ids, max_new_tokens=40)
|
||||
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_BF16)
|
||||
|
||||
def test_quantized_model_exllama(self):
    """
    Check that the quantized model, loaded with the exllama version of the AWQ config,
    generates the expected text.
    """
    # Tokenize the prompt first and move the encoded tensors to the test device.
    encoded_prompt = self.tokenizer(self.input_text, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(torch_device)

    # Request the exllama kernels through the AWQ quantization config.
    exllama_config = AwqConfig(version="exllama")
    model = AutoModelForCausalLM.from_pretrained(self.model_name, quantization_config=exllama_config)
    model = model.to(torch_device)

    generated = model.generate(**encoded_prompt, max_new_tokens=40)
    decoded_text = self.tokenizer.decode(generated[0], skip_special_tokens=True)
    self.assertEqual(decoded_text, self.EXPECTED_OUTPUT)
|
||||
|
||||
def test_quantized_model_no_device_map(self):
|
||||
"""
|
||||
Simple test that checks if the quantized model is working properly
|
||||
|
||||
Reference in New Issue
Block a user