Fix quantization tests (#29914)

* revert back to torch 2.1.1 * run test * switch to torch 2.2.1 * update dockerfile * fix awq tests * fix test * run quanto tests * update tests * split quantization tests * fix * fix again * final fix * fix report artifact * build docker again * Revert "build docker again" This reverts commit 399a5f9d9308da071d79034f238c719de0f3532e. * debug * revert * style * new notification system * testing notification * rebuild docker * fix_prev_ci_results * typo * remove warning * fix typo * fix artifact name * debug * issue fixed * debug again * fix * fix time * test notif with failing test * typo * issues again * final fix ? * run all quantization tests again * remove name to clear space * revert modification done on workflow * fix * build docker * build only quant docker * fix quantization ci * fix * fix report * better quantization_matrix * add print * revert to the basic one
2024-04-09 17:10:29 +02:00
parent 6487e9b370
commit 58a939c6b7
7 changed files with 324 additions and 30 deletions
--- a/tests/quantization/autoawq/test_awq.py
+++ b/tests/quantization/autoawq/test_awq.py
@@ -101,7 +101,7 @@ class AwqTest(unittest.TestCase):

    EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish"
    EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a"
-
+    EXPECTED_OUTPUT_EXLLAMA = "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out"
    device_map = "cuda"

    # called only once for all test in this class
@@ -200,11 +200,11 @@ class AwqTest(unittest.TestCase):

        quantization_config = AwqConfig(version="exllama")
        quantized_model = AutoModelForCausalLM.from_pretrained(
-            self.model_name, quantization_config=quantization_config
-        ).to(torch_device)
+            self.model_name, quantization_config=quantization_config, device_map=torch_device
+        )

        output = quantized_model.generate(**input_ids, max_new_tokens=40)
-        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
+        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_EXLLAMA)

    def test_quantized_model_no_device_map(self):
        """
@@ -239,7 +239,7 @@ class AwqTest(unittest.TestCase):

        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")

-        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1, 2, 3})
+        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=40)

@@ -272,8 +272,8 @@ class AwqFusedTest(unittest.TestCase):
    model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
    model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"

-    custom_mapping_model_id = "TheBloke/Yi-34B-AWQ"
-    custom_model_revision = "f1b2cd1b7459ceecfdc1fac5bb8725f13707c589"
+    custom_mapping_model_id = "TheBloke/Mistral-7B-v0.1-AWQ"
+    custom_model_revision = "f186bcfa9edbe2a4334262ec1e67f23e53ed1ae7"

    mixtral_model_name = "casperhansen/mixtral-instruct-awq"
    mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b"
@@ -287,8 +287,8 @@ class AwqFusedTest(unittest.TestCase):
        "You end up exactly where you started. Where are you?"
    )

-    EXPECTED_GENERATION = prompt + "\n\nThis is a classic puzzle that has been around for"
-    EXPECTED_GENERATION_CUSTOM_MODEL = "HelloWorld.java:11)\r\n\tat org"
+    EXPECTED_GENERATION = prompt + "\n\nYou are at the starting point.\n\nIf"
+    EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20"
    EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe"

    def tearDown(self):
@@ -423,28 +423,25 @@ class AwqFusedTest(unittest.TestCase):
            fuse_max_seq_len=512,
            modules_to_fuse={
                "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
-                "layernorm": ["ln1", "ln2", "norm"],
                "mlp": ["gate_proj", "up_proj", "down_proj"],
+                "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
                "use_alibi": False,
-                "num_attention_heads": 56,
+                "hidden_size": 4096,
+                "num_attention_heads": 32,
                "num_key_value_heads": 8,
-                "hidden_size": 7168,
            },
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.custom_mapping_model_id,
            quantization_config=quantization_config,
-            trust_remote_code=True,
            device_map="balanced",
            revision=self.custom_model_revision,
        )

        self._check_fused_modules(model)

-        tokenizer = AutoTokenizer.from_pretrained(
-            self.custom_mapping_model_id, revision=self.custom_model_revision, trust_remote_code=True
-        )
+        tokenizer = AutoTokenizer.from_pretrained(self.custom_mapping_model_id, revision=self.custom_model_revision)

        prompt = "Hello"
        inputs = tokenizer(prompt, return_tensors="pt").to(torch_device)
@@ -452,6 +449,7 @@ class AwqFusedTest(unittest.TestCase):
        outputs = model.generate(**inputs, max_new_tokens=12)
        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)

+    @unittest.skip("Not enough GPU memory on CI runners")
    @require_torch_multi_gpu
    def test_generation_mixtral_fused(self):
        """