Enable gptqmodel (#35012)
* gptqmodel Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix format Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * update readme Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * gptqmodel need use checkpoint_format (#1) * gptqmodel need use checkpoint_format * fix quantize * Update quantization_config.py * Update quantization_config.py * Update quantization_config.py --------- Co-authored-by: ZX-ModelCloud <zx@modelcloud.ai> Co-authored-by: Qubitium-ModelCloud <qubitium@modelcloud.ai> * Revert quantizer_gptq.py (#2) * revert quantizer_gptq.py change * pass **kwargs * limit gptqmodel and optimum version Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix format Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix warning Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix version check Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * revert unrelated changes Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * enable gptqmodel tests Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix requires gptq Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * Fix Transformer compat (#3) * revert quantizer_gptq.py change * pass **kwargs * add meta info * cleanup * cleanup * Update quantization_config.py * hf_select_quant_linear pass checkpoint_format and meta * fix GPTQTestCUDA * Update test_gptq.py * gptqmodel.hf_select_quant_linear() now does not select ExllamaV2 * cleanup * add backend * cleanup * cleanup * no need check exllama version * Update quantization_config.py * lower checkpoint_format and backend * check none * cleanup * Update quantization_config.py * fix self.use_exllama == False * spell * fix unittest * fix unittest --------- Co-authored-by: LRL <lrl@lbx.dev> Co-authored-by: Qubitium-ModelCloud <qubitium@modelcloud.ai> * fix format Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix format again Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * update gptqmodel version (#6) * update gptqmodel version * update 
gptqmodel version * fix unit test (#5) * update gptqmodel version * update gptqmodel version * "not self.use_exllama" is not equivalent to "self.use_exllama==False" * fix unittest * update gptqmodel version * backend is loading_attibutes (#7) * fix format and tests Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix memory check Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix device mismatch Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix result check Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * Update src/transformers/quantizers/quantizer_gptq.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> * Update src/transformers/quantizers/quantizer_gptq.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> * Update src/transformers/quantizers/quantizer_gptq.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> * update tests Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * review: update docs (#10) * review: update docs (#12) * review: update docs * fix typo * update tests for gptqmodel Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * update document (#9) * update overview.md * cleanup * Update overview.md * Update overview.md * Update overview.md * update gptq.md * Update gptq.md * Update gptq.md * Update gptq.md * Update gptq.md * Update gptq.md * Update gptq.md --------- Co-authored-by: Qubitium-ModelCloud <qubitium@modelcloud.ai> * typo * doc note for asymmetric quant * typo with apple silicon(e) * typo for marlin * column name revert: review * doc rocm support * Update docs/source/en/quantization/gptq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/gptq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/gptq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/gptq.md Co-authored-by: Steven 
Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/overview.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/overview.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Signed-off-by: jiqing-feng <jiqing.feng@intel.com> Co-authored-by: LRL-ModelCloud <165116337+LRL-ModelCloud@users.noreply.github.com> Co-authored-by: ZX-ModelCloud <zx@modelcloud.ai> Co-authored-by: Qubitium-ModelCloud <qubitium@modelcloud.ai> Co-authored-by: ZX-ModelCloud <165115237+ZX-ModelCloud@users.noreply.github.com> Co-authored-by: LRL <lrl@lbx.dev> Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
This commit is contained in:
@@ -18,16 +18,17 @@ import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GPTQConfig
|
||||
from transformers.testing_utils import (
|
||||
is_torch_available,
|
||||
require_accelerate,
|
||||
require_auto_gptq,
|
||||
require_gptq,
|
||||
require_optimum,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
slow,
|
||||
)
|
||||
from transformers.utils import is_auto_gptq_available, is_gptqmodel_available, is_ipex_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
@@ -76,25 +77,29 @@ class GPTQConfigTest(unittest.TestCase):
|
||||
|
||||
@slow
|
||||
@require_optimum
|
||||
@require_auto_gptq
|
||||
@require_torch_gpu
|
||||
@require_gptq
|
||||
class GPTQTest(unittest.TestCase):
|
||||
model_name = "bigscience/bloom-560m"
|
||||
|
||||
input_text = "Hello my name is"
|
||||
|
||||
EXPECTED_OUTPUTS = set()
|
||||
# flaky test: gptqmodel and auto-gptq do not produce equivalent outputs, and the string comparison is not deterministic even across transformers/torch versions
|
||||
EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. I")
|
||||
EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I")
|
||||
EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")
|
||||
EXPECTED_OUTPUTS.add("Hello my name is John and I am a very good looking man.")
|
||||
EXPECTED_OUTPUTS.add("Hello my name is Alyson, I am a student in the")
|
||||
EXPECTED_OUTPUTS.add("Hello my name is Alyson and I am a very sweet,")
|
||||
EXPECTED_OUTPUTS.add("Hello my name is Aiden, I am a student at the University")
|
||||
EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a member of the N")
|
||||
EXPECTED_OUTPUTS.add("Hello my name is Nellie and I am a student at the")
|
||||
|
||||
# this seems a little small considering that we are doing 4bit quant, but we have a small model and we don't quantize the embeddings
|
||||
EXPECTED_RELATIVE_DIFFERENCE = 1.664253062
|
||||
|
||||
bits = 4
|
||||
sym = True
|
||||
group_size = 128
|
||||
desc_act = False
|
||||
use_exllama = False
|
||||
@@ -103,7 +108,7 @@ class GPTQTest(unittest.TestCase):
|
||||
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
|
||||
]
|
||||
|
||||
device_map = None
|
||||
device_map = "cpu" if is_gptqmodel_available() else None
|
||||
|
||||
# called only once for all tests in this class
|
||||
@classmethod
|
||||
@@ -117,13 +122,15 @@ class GPTQTest(unittest.TestCase):
|
||||
cls.mem_fp16 = cls.model_fp16.get_memory_footprint()
|
||||
|
||||
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)
|
||||
cls.config = AutoConfig.from_pretrained(cls.model_name)
|
||||
|
||||
quantization_config = GPTQConfig(
|
||||
cls.quantization_config = GPTQConfig(
|
||||
bits=cls.bits,
|
||||
dataset=cls.dataset,
|
||||
tokenizer=cls.tokenizer,
|
||||
group_size=cls.group_size,
|
||||
desc_act=cls.desc_act,
|
||||
sym=cls.sym,
|
||||
use_exllama=cls.use_exllama,
|
||||
)
|
||||
|
||||
@@ -131,7 +138,7 @@ class GPTQTest(unittest.TestCase):
|
||||
cls.model_name,
|
||||
torch_dtype=torch.float16,
|
||||
device_map=cls.device_map,
|
||||
quantization_config=quantization_config,
|
||||
quantization_config=cls.quantization_config,
|
||||
)
|
||||
|
||||
def test_memory_footprint(self):
|
||||
@@ -142,7 +149,7 @@ class GPTQTest(unittest.TestCase):
|
||||
|
||||
mem_quantized = self.quantized_model.get_memory_footprint()
|
||||
|
||||
self.assertAlmostEqual(self.mem_fp16 / mem_quantized, self.EXPECTED_RELATIVE_DIFFERENCE)
|
||||
self.assertAlmostEqual(self.mem_fp16 / mem_quantized, self.EXPECTED_RELATIVE_DIFFERENCE, places=4)
|
||||
|
||||
def test_device_and_dtype_assignment(self):
|
||||
r"""
|
||||
@@ -150,7 +157,7 @@ class GPTQTest(unittest.TestCase):
|
||||
Also checks if other models are cast correctly.
|
||||
"""
|
||||
# This should work
|
||||
if self.device_map is None:
|
||||
if self.device_map in (None, "cpu"):
|
||||
_ = self.quantized_model.to(0)
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
@@ -170,16 +177,36 @@ class GPTQTest(unittest.TestCase):
|
||||
Simple test to check if the model conversion has been done correctly by checking on
|
||||
the class type of the linear layers of the converted models
|
||||
"""
|
||||
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
|
||||
if is_gptqmodel_available():
|
||||
from gptqmodel.utils.importer import hf_select_quant_linear
|
||||
|
||||
QuantLinear = dynamically_import_QuantLinear(
|
||||
use_triton=False,
|
||||
desc_act=self.desc_act,
|
||||
group_size=self.group_size,
|
||||
bits=self.bits,
|
||||
disable_exllama=not self.use_exllama,
|
||||
disable_exllamav2=True,
|
||||
)
|
||||
if hasattr(self.config, "quantization_config"):
|
||||
checkpoint_format = self.config.quantization_config.get("checkpoint_format")
|
||||
meta = self.config.quantization_config.get("meta")
|
||||
else:
|
||||
checkpoint_format = "gptq"
|
||||
meta = None
|
||||
QuantLinear = hf_select_quant_linear(
|
||||
bits=self.bits,
|
||||
group_size=self.group_size,
|
||||
desc_act=self.desc_act,
|
||||
sym=self.sym,
|
||||
device_map=self.device_map,
|
||||
checkpoint_format=checkpoint_format,
|
||||
meta=meta,
|
||||
backend=self.quantization_config.backend,
|
||||
)
|
||||
elif is_auto_gptq_available():
|
||||
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear
|
||||
|
||||
QuantLinear = hf_select_quant_linear(
|
||||
use_triton=False,
|
||||
desc_act=self.desc_act,
|
||||
group_size=self.group_size,
|
||||
bits=self.bits,
|
||||
disable_exllama=not self.use_exllama,
|
||||
disable_exllamav2=True,
|
||||
)
|
||||
self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear)
|
||||
|
||||
def check_inference_correctness(self, model):
|
||||
@@ -192,7 +219,7 @@ class GPTQTest(unittest.TestCase):
|
||||
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
|
||||
|
||||
# Check the exactness of the results
|
||||
output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
|
||||
output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(model.device), max_new_tokens=10)
|
||||
|
||||
# Get the generation
|
||||
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
|
||||
@@ -207,6 +234,8 @@ class GPTQTest(unittest.TestCase):
|
||||
if self.device_map is None:
|
||||
self.check_inference_correctness(self.quantized_model.to(0))
|
||||
else:
|
||||
if self.device_map == "cpu" and self.quantized_model.device.type != "cpu":
|
||||
self.quantized_model.to("cpu")
|
||||
self.check_inference_correctness(self.quantized_model)
|
||||
|
||||
def test_serialization(self):
|
||||
@@ -215,15 +244,28 @@ class GPTQTest(unittest.TestCase):
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
self.quantized_model.save_pretrained(tmpdirname)
|
||||
if not self.use_exllama:
|
||||
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
|
||||
tmpdirname, quantization_config=GPTQConfig(use_exllama=False, bits=4)
|
||||
).to(0)
|
||||
self.check_quantized_layers_type(quantized_model_from_saved, "cuda-old")
|
||||
if is_auto_gptq_available() and not is_gptqmodel_available():
|
||||
quant_type = "cuda-old" if not self.use_exllama else "exllama"
|
||||
if not self.use_exllama:
|
||||
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
|
||||
tmpdirname, quantization_config=GPTQConfig(use_exllama=False, bits=4)
|
||||
)
|
||||
if self.device_map != "cpu":
|
||||
quantized_model_from_saved = quantized_model_from_saved.to(0)
|
||||
else:
|
||||
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
|
||||
tmpdirname, device_map=self.device_map
|
||||
)
|
||||
else:
|
||||
# we need to put it directly to the gpu. Otherwise, we won't be able to initialize the exllama kernel
|
||||
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map={"": 0})
|
||||
self.check_quantized_layers_type(quantized_model_from_saved, "exllama")
|
||||
if self.device_map == "cpu":
|
||||
quant_type = "ipex" if is_ipex_available() else "torch"
|
||||
else:
|
||||
quant_type = "exllama"
|
||||
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
|
||||
tmpdirname, device_map=self.device_map
|
||||
)
|
||||
|
||||
self.check_quantized_layers_type(quantized_model_from_saved, quant_type)
|
||||
self.check_inference_correctness(quantized_model_from_saved)
|
||||
|
||||
@require_accelerate
|
||||
@@ -233,20 +275,26 @@ class GPTQTest(unittest.TestCase):
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
self.quantized_model.save_pretrained(tmpdirname)
|
||||
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
|
||||
device_map = self.device_map or "auto"
|
||||
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=device_map)
|
||||
self.check_inference_correctness(quantized_model_from_saved)
|
||||
|
||||
|
||||
@require_torch_gpu
|
||||
class GPTQTestCUDA(GPTQTest):
|
||||
device_map = {"": 0}
|
||||
|
||||
def test_change_loading_attributes(self):
|
||||
"""
|
||||
Test the serialization of the model and the loading of the quantized weights works with another config file
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
self.quantized_model.save_pretrained(tmpdirname)
|
||||
if not self.use_exllama:
|
||||
if is_auto_gptq_available() and not is_gptqmodel_available() and not self.use_exllama:
|
||||
self.check_quantized_layers_type(self.quantized_model, "cuda-old")
|
||||
# we need to put it directly to the gpu. Otherwise, we won't be able to initialize the exllama kernel
|
||||
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
|
||||
tmpdirname, quantization_config=GPTQConfig(use_exllama=True, bits=4), device_map={"": 0}
|
||||
tmpdirname, quantization_config=GPTQConfig(use_exllama=True, bits=4), device_map=self.device_map
|
||||
)
|
||||
self.assertEqual(quantized_model_from_saved.config.quantization_config.bits, self.bits)
|
||||
self.check_quantized_layers_type(quantized_model_from_saved, "exllama")
|
||||
@@ -255,20 +303,20 @@ class GPTQTest(unittest.TestCase):
|
||||
|
||||
@require_accelerate
|
||||
@require_torch_multi_gpu
|
||||
class GPTQTestDeviceMap(GPTQTest):
|
||||
class GPTQTestDeviceMap(GPTQTestCUDA):
|
||||
device_map = "auto"
|
||||
|
||||
|
||||
@require_accelerate
|
||||
@require_torch_multi_gpu
|
||||
class GPTQTestDeviceMapExllama(GPTQTest):
|
||||
class GPTQTestDeviceMapExllama(GPTQTestCUDA):
|
||||
device_map = "auto"
|
||||
use_exllama = True
|
||||
|
||||
|
||||
@slow
|
||||
@require_optimum
|
||||
@require_auto_gptq
|
||||
@require_gptq
|
||||
@require_torch_gpu
|
||||
@require_accelerate
|
||||
class GPTQTestActOrderExllama(unittest.TestCase):
|
||||
@@ -279,6 +327,7 @@ class GPTQTestActOrderExllama(unittest.TestCase):
|
||||
"""
|
||||
|
||||
EXPECTED_OUTPUTS = set()
|
||||
# flaky test: gptqmodel and auto-gptq do not produce equivalent outputs, and the string comparison is not deterministic even across transformers/torch versions
|
||||
EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.")
|
||||
# 4bit + act_order + 128g
|
||||
model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
|
||||
@@ -343,7 +392,7 @@ class GPTQTestActOrderExllama(unittest.TestCase):
|
||||
|
||||
@slow
|
||||
@require_optimum
|
||||
@require_auto_gptq
|
||||
@require_gptq
|
||||
@require_torch_gpu
|
||||
@require_accelerate
|
||||
class GPTQTestExllamaV2(unittest.TestCase):
|
||||
@@ -354,6 +403,7 @@ class GPTQTestExllamaV2(unittest.TestCase):
|
||||
"""
|
||||
|
||||
EXPECTED_OUTPUTS = set()
|
||||
# flaky test: gptqmodel and auto-gptq do not produce equivalent outputs, and the string comparison is not deterministic even across transformers/torch versions
|
||||
EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.")
|
||||
# 4bit + act_order + 128g
|
||||
model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
|
||||
@@ -374,7 +424,10 @@ class GPTQTestExllamaV2(unittest.TestCase):
|
||||
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)
|
||||
|
||||
def test_quantized_layers_type(self):
|
||||
self.assertTrue(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE == "exllamav2")
|
||||
self.assertEqual(
|
||||
self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
|
||||
"exllama" if is_gptqmodel_available() else "exllamav2",
|
||||
)
|
||||
|
||||
def check_inference_correctness(self, model):
|
||||
"""
|
||||
|
||||
@@ -21,7 +21,7 @@ from parameterized import parameterized
|
||||
from transformers import set_seed
|
||||
from transformers.testing_utils import (
|
||||
is_torch_available,
|
||||
require_auto_gptq,
|
||||
require_gptq,
|
||||
require_non_xpu,
|
||||
require_read_token,
|
||||
require_torch,
|
||||
@@ -319,7 +319,7 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
self.assertListEqual(decoded, expected_text)
|
||||
|
||||
@require_non_xpu
|
||||
@require_auto_gptq
|
||||
@require_gptq
|
||||
def test_sink_cache_hard(self):
|
||||
tokenizer = AutoTokenizer.from_pretrained("TheBloke/LLaMa-7B-GPTQ")
|
||||
model = AutoModelForCausalLM.from_pretrained("TheBloke/LLaMa-7B-GPTQ", device_map="auto")
|
||||
|
||||
Reference in New Issue
Block a user