Enable gptqmodel (#35012)

* gptqmodel Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix format Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * update readme Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * gptqmodel need use checkpoint_format (#1) * gptqmodel need use checkpoint_format * fix quantize * Update quantization_config.py * Update quantization_config.py * Update quantization_config.py --------- Co-authored-by: ZX-ModelCloud <zx@modelcloud.ai> Co-authored-by: Qubitium-ModelCloud <qubitium@modelcloud.ai> * Revert quantizer_gptq.py (#2) * revert quantizer_gptq.py change * pass **kwargs * limit gptqmodel and optimum version Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix format Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix warning Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix version check Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * revert unrelated changes Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * enable gptqmodel tests Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix requires gptq Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * Fix Transformer compat (#3) * revert quantizer_gptq.py change * pass **kwargs * add meta info * cleanup * cleanup * Update quantization_config.py * hf_select_quant_linear pass checkpoint_format and meta * fix GPTQTestCUDA * Update test_gptq.py * gptqmodel.hf_select_quant_linear() now does not select ExllamaV2 * cleanup * add backend * cleanup * cleanup * no need check exllama version * Update quantization_config.py * lower checkpoint_format and backend * check none * cleanup * Update quantization_config.py * fix self.use_exllama == False * spell * fix unittest * fix unittest --------- Co-authored-by: LRL <lrl@lbx.dev> Co-authored-by: Qubitium-ModelCloud <qubitium@modelcloud.ai> * fix format Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix format again Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * update gptqmodel version (#6) * update gptqmodel version * update 
gptqmodel version * fix unit test (#5) * update gptqmodel version * update gptqmodel version * "not self.use_exllama" is not equivalent to "self.use_exllama==False" * fix unittest * update gptqmodel version * backend is loading_attibutes (#7) * fix format and tests Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix memory check Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix device mismatch Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix result check Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * Update src/transformers/quantizers/quantizer_gptq.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> * Update src/transformers/quantizers/quantizer_gptq.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> * Update src/transformers/quantizers/quantizer_gptq.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> * update tests Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * review: update docs (#10) * review: update docs (#12) * review: update docs * fix typo * update tests for gptqmodel Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * update document (#9) * update overview.md * cleanup * Update overview.md * Update overview.md * Update overview.md * update gptq.md * Update gptq.md * Update gptq.md * Update gptq.md * Update gptq.md * Update gptq.md * Update gptq.md --------- Co-authored-by: Qubitium-ModelCloud <qubitium@modelcloud.ai> * typo * doc note for asymmetric quant * typo with apple silicon(e) * typo for marlin * column name revert: review * doc rocm support * Update docs/source/en/quantization/gptq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/gptq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/gptq.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/gptq.md Co-authored-by: Steven 
Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/overview.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/quantization/overview.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Signed-off-by: jiqing-feng <jiqing.feng@intel.com> Co-authored-by: LRL-ModelCloud <165116337+LRL-ModelCloud@users.noreply.github.com> Co-authored-by: ZX-ModelCloud <zx@modelcloud.ai> Co-authored-by: Qubitium-ModelCloud <qubitium@modelcloud.ai> Co-authored-by: ZX-ModelCloud <165115237+ZX-ModelCloud@users.noreply.github.com> Co-authored-by: LRL <lrl@lbx.dev> Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-01-15 21:22:49 +08:00
parent 615bf9c5e4
commit 387663e571
9 changed files with 267 additions and 87 deletions
--- a/tests/quantization/gptq/test_gptq.py
+++ b/tests/quantization/gptq/test_gptq.py
@@ -18,16 +18,17 @@ import unittest

 import pytest

-from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GPTQConfig
 from transformers.testing_utils import (
    is_torch_available,
    require_accelerate,
-    require_auto_gptq,
+    require_gptq,
    require_optimum,
    require_torch_gpu,
    require_torch_multi_gpu,
    slow,
 )
+from transformers.utils import is_auto_gptq_available, is_gptqmodel_available, is_ipex_available


 if is_torch_available():
@@ -76,25 +77,29 @@ class GPTQConfigTest(unittest.TestCase):

@slow
@require_optimum
-@require_auto_gptq
-@require_torch_gpu
+@require_gptq
 class GPTQTest(unittest.TestCase):
    model_name = "bigscience/bloom-560m"

    input_text = "Hello my name is"

    EXPECTED_OUTPUTS = set()
+    # flaky test: gptqmodel and auto-gptq do not produce identical outputs, and exact string comparison is not deterministic even across transformers/torch versions
    EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. I")
    EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I")
    EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")
    EXPECTED_OUTPUTS.add("Hello my name is John and I am a very good looking man.")
    EXPECTED_OUTPUTS.add("Hello my name is Alyson, I am a student in the")
    EXPECTED_OUTPUTS.add("Hello my name is Alyson and I am a very sweet,")
+    EXPECTED_OUTPUTS.add("Hello my name is Aiden, I am a student at the University")
+    EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a member of the N")
+    EXPECTED_OUTPUTS.add("Hello my name is Nellie and I am a student at the")

    # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings
    EXPECTED_RELATIVE_DIFFERENCE = 1.664253062

    bits = 4
+    sym = True
    group_size = 128
    desc_act = False
    use_exllama = False
@@ -103,7 +108,7 @@ class GPTQTest(unittest.TestCase):
        "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
    ]

-    device_map = None
+    device_map = "cpu" if is_gptqmodel_available() else None

    # called only once for all test in this class
    @classmethod
@@ -117,13 +122,15 @@ class GPTQTest(unittest.TestCase):
        cls.mem_fp16 = cls.model_fp16.get_memory_footprint()

        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)
+        cls.config = AutoConfig.from_pretrained(cls.model_name)

-        quantization_config = GPTQConfig(
+        cls.quantization_config = GPTQConfig(
            bits=cls.bits,
            dataset=cls.dataset,
            tokenizer=cls.tokenizer,
            group_size=cls.group_size,
            desc_act=cls.desc_act,
+            sym=cls.sym,
            use_exllama=cls.use_exllama,
        )

@@ -131,7 +138,7 @@ class GPTQTest(unittest.TestCase):
            cls.model_name,
            torch_dtype=torch.float16,
            device_map=cls.device_map,
-            quantization_config=quantization_config,
+            quantization_config=cls.quantization_config,
        )

    def test_memory_footprint(self):
@@ -142,7 +149,7 @@ class GPTQTest(unittest.TestCase):

        mem_quantized = self.quantized_model.get_memory_footprint()

-        self.assertAlmostEqual(self.mem_fp16 / mem_quantized, self.EXPECTED_RELATIVE_DIFFERENCE)
+        self.assertAlmostEqual(self.mem_fp16 / mem_quantized, self.EXPECTED_RELATIVE_DIFFERENCE, places=4)

    def test_device_and_dtype_assignment(self):
        r"""
@@ -150,7 +157,7 @@ class GPTQTest(unittest.TestCase):
        Checks also if other models are casted correctly.
        """
        # This should work
-        if self.device_map is None:
+        if self.device_map in (None, "cpu"):
            _ = self.quantized_model.to(0)

        with self.assertRaises(ValueError):
@@ -170,16 +177,36 @@ class GPTQTest(unittest.TestCase):
        Simple test to check if the model conversion has been done correctly by checking on
        the class type of the linear layers of the converted models
        """
-        from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
+        if is_gptqmodel_available():
+            from gptqmodel.utils.importer import hf_select_quant_linear

-        QuantLinear = dynamically_import_QuantLinear(
-            use_triton=False,
-            desc_act=self.desc_act,
-            group_size=self.group_size,
-            bits=self.bits,
-            disable_exllama=not self.use_exllama,
-            disable_exllamav2=True,
-        )
+            if hasattr(self.config, "quantization_config"):
+                checkpoint_format = self.config.quantization_config.get("checkpoint_format")
+                meta = self.config.quantization_config.get("meta")
+            else:
+                checkpoint_format = "gptq"
+                meta = None
+            QuantLinear = hf_select_quant_linear(
+                bits=self.bits,
+                group_size=self.group_size,
+                desc_act=self.desc_act,
+                sym=self.sym,
+                device_map=self.device_map,
+                checkpoint_format=checkpoint_format,
+                meta=meta,
+                backend=self.quantization_config.backend,
+            )
+        elif is_auto_gptq_available():
+            from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear
+
+            QuantLinear = hf_select_quant_linear(
+                use_triton=False,
+                desc_act=self.desc_act,
+                group_size=self.group_size,
+                bits=self.bits,
+                disable_exllama=not self.use_exllama,
+                disable_exllamav2=True,
+            )
        self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear)

    def check_inference_correctness(self, model):
@@ -192,7 +219,7 @@ class GPTQTest(unittest.TestCase):
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        # Check the exactness of the results
-        output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
+        output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(model.device), max_new_tokens=10)

        # Get the generation
        self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
@@ -207,6 +234,8 @@ class GPTQTest(unittest.TestCase):
        if self.device_map is None:
            self.check_inference_correctness(self.quantized_model.to(0))
        else:
+            if self.device_map == "cpu" and self.quantized_model.device.type != "cpu":
+                self.quantized_model.to("cpu")
            self.check_inference_correctness(self.quantized_model)

    def test_serialization(self):
@@ -215,15 +244,28 @@ class GPTQTest(unittest.TestCase):
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
-            if not self.use_exllama:
-                quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
-                    tmpdirname, quantization_config=GPTQConfig(use_exllama=False, bits=4)
-                ).to(0)
-                self.check_quantized_layers_type(quantized_model_from_saved, "cuda-old")
+            if is_auto_gptq_available() and not is_gptqmodel_available():
+                quant_type = "cuda-old" if not self.use_exllama else "exllama"
+                if not self.use_exllama:
+                    quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
+                        tmpdirname, quantization_config=GPTQConfig(use_exllama=False, bits=4)
+                    )
+                    if self.device_map != "cpu":
+                        quantized_model_from_saved = quantized_model_from_saved.to(0)
+                else:
+                    quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
+                        tmpdirname, device_map=self.device_map
+                    )
            else:
-                # we need to put it directly to the gpu. Otherwise, we won't be able to initialize the exllama kernel
-                quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map={"": 0})
-                self.check_quantized_layers_type(quantized_model_from_saved, "exllama")
+                if self.device_map == "cpu":
+                    quant_type = "ipex" if is_ipex_available() else "torch"
+                else:
+                    quant_type = "exllama"
+                quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
+                    tmpdirname, device_map=self.device_map
+                )
+
+            self.check_quantized_layers_type(quantized_model_from_saved, quant_type)
            self.check_inference_correctness(quantized_model_from_saved)

    @require_accelerate
@@ -233,20 +275,26 @@ class GPTQTest(unittest.TestCase):
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
-            quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
+            device_map = self.device_map or "auto"
+            quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=device_map)
            self.check_inference_correctness(quantized_model_from_saved)

+
+@require_torch_gpu
+class GPTQTestCUDA(GPTQTest):
+    device_map = {"": 0}
+
    def test_change_loading_attributes(self):
        """
        Test the serialization of the model and the loading of the quantized weights works with another config file
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
-            if not self.use_exllama:
+            if is_auto_gptq_available() and not is_gptqmodel_available() and not self.use_exllama:
                self.check_quantized_layers_type(self.quantized_model, "cuda-old")
                # we need to put it directly to the gpu. Otherwise, we won't be able to initialize the exllama kernel
                quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
-                    tmpdirname, quantization_config=GPTQConfig(use_exllama=True, bits=4), device_map={"": 0}
+                    tmpdirname, quantization_config=GPTQConfig(use_exllama=True, bits=4), device_map=self.device_map
                )
                self.assertEqual(quantized_model_from_saved.config.quantization_config.bits, self.bits)
                self.check_quantized_layers_type(quantized_model_from_saved, "exllama")
@@ -255,20 +303,20 @@ class GPTQTest(unittest.TestCase):

@require_accelerate
@require_torch_multi_gpu
-class GPTQTestDeviceMap(GPTQTest):
+class GPTQTestDeviceMap(GPTQTestCUDA):
    device_map = "auto"


@require_accelerate
@require_torch_multi_gpu
-class GPTQTestDeviceMapExllama(GPTQTest):
+class GPTQTestDeviceMapExllama(GPTQTestCUDA):
    device_map = "auto"
    use_exllama = True


@slow
@require_optimum
-@require_auto_gptq
+@require_gptq
@require_torch_gpu
@require_accelerate
 class GPTQTestActOrderExllama(unittest.TestCase):
@@ -279,6 +327,7 @@ class GPTQTestActOrderExllama(unittest.TestCase):
    """

    EXPECTED_OUTPUTS = set()
+    # flaky test: gptqmodel and auto-gptq do not produce identical outputs, and exact string comparison is not deterministic even across transformers/torch versions
    EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.")
    # 4bit + act_order + 128g
    model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
@@ -343,7 +392,7 @@ class GPTQTestActOrderExllama(unittest.TestCase):

@slow
@require_optimum
-@require_auto_gptq
+@require_gptq
@require_torch_gpu
@require_accelerate
 class GPTQTestExllamaV2(unittest.TestCase):
@@ -354,6 +403,7 @@ class GPTQTestExllamaV2(unittest.TestCase):
    """

    EXPECTED_OUTPUTS = set()
+    # flaky test: gptqmodel and auto-gptq do not produce identical outputs, and exact string comparison is not deterministic even across transformers/torch versions
    EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.")
    # 4bit + act_order + 128g
    model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
@@ -374,7 +424,10 @@ class GPTQTestExllamaV2(unittest.TestCase):
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)

    def test_quantized_layers_type(self):
-        self.assertTrue(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE == "exllamav2")
+        self.assertEqual(
+            self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
+            "exllama" if is_gptqmodel_available() else "exllamav2",
+        )

    def check_inference_correctness(self, model):
        """
--- a/tests/utils/test_cache_utils.py
+++ b/tests/utils/test_cache_utils.py
@@ -21,7 +21,7 @@ from parameterized import parameterized
 from transformers import set_seed
 from transformers.testing_utils import (
    is_torch_available,
-    require_auto_gptq,
+    require_gptq,
    require_non_xpu,
    require_read_token,
    require_torch,
@@ -319,7 +319,7 @@ class CacheIntegrationTest(unittest.TestCase):
        self.assertListEqual(decoded, expected_text)

    @require_non_xpu
-    @require_auto_gptq
+    @require_gptq
    def test_sink_cache_hard(self):
        tokenizer = AutoTokenizer.from_pretrained("TheBloke/LLaMa-7B-GPTQ")
        model = AutoModelForCausalLM.from_pretrained("TheBloke/LLaMa-7B-GPTQ", device_map="auto")