Fix quantization tests (#29914)
* revert back to torch 2.1.1 * run test * switch to torch 2.2.1 * update dockerfile * fix awq tests * fix test * run quanto tests * update tests * split quantization tests * fix * fix again * final fix * fix report artifact * build docker again * Revert "build docker again" This reverts commit 399a5f9d9308da071d79034f238c719de0f3532e. * debug * revert * style * new notification system * testing notification * rebuild docker * fix_prev_ci_results * typo * remove warning * fix typo * fix artifact name * debug * issue fixed * debug again * fix * fix time * test notif with failing test * typo * issues again * final fix ? * run all quantization tests again * remove name to clear space * revert modification done on workflow * fix * build docker * build only quant docker * fix quantization ci * fix * fix report * better quantization_matrix * add print * revert to the basic one
This commit is contained in:
@@ -101,7 +101,7 @@ class AwqTest(unittest.TestCase):
|
||||
|
||||
EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish"
|
||||
EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a"
|
||||
|
||||
EXPECTED_OUTPUT_EXLLAMA = "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out"
|
||||
device_map = "cuda"
|
||||
|
||||
# called only once for all test in this class
|
||||
@@ -200,11 +200,11 @@ class AwqTest(unittest.TestCase):
|
||||
|
||||
quantization_config = AwqConfig(version="exllama")
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
self.model_name, quantization_config=quantization_config
|
||||
).to(torch_device)
|
||||
self.model_name, quantization_config=quantization_config, device_map=torch_device
|
||||
)
|
||||
|
||||
output = quantized_model.generate(**input_ids, max_new_tokens=40)
|
||||
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
|
||||
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_EXLLAMA)
|
||||
|
||||
def test_quantized_model_no_device_map(self):
|
||||
"""
|
||||
@@ -239,7 +239,7 @@ class AwqTest(unittest.TestCase):
|
||||
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")
|
||||
|
||||
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1, 2, 3})
|
||||
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
|
||||
|
||||
output = quantized_model.generate(**input_ids, max_new_tokens=40)
|
||||
|
||||
@@ -272,8 +272,8 @@ class AwqFusedTest(unittest.TestCase):
|
||||
model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
|
||||
model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"
|
||||
|
||||
custom_mapping_model_id = "TheBloke/Yi-34B-AWQ"
|
||||
custom_model_revision = "f1b2cd1b7459ceecfdc1fac5bb8725f13707c589"
|
||||
custom_mapping_model_id = "TheBloke/Mistral-7B-v0.1-AWQ"
|
||||
custom_model_revision = "f186bcfa9edbe2a4334262ec1e67f23e53ed1ae7"
|
||||
|
||||
mixtral_model_name = "casperhansen/mixtral-instruct-awq"
|
||||
mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b"
|
||||
@@ -287,8 +287,8 @@ class AwqFusedTest(unittest.TestCase):
|
||||
"You end up exactly where you started. Where are you?"
|
||||
)
|
||||
|
||||
EXPECTED_GENERATION = prompt + "\n\nThis is a classic puzzle that has been around for"
|
||||
EXPECTED_GENERATION_CUSTOM_MODEL = "HelloWorld.java:11)\r\n\tat org"
|
||||
EXPECTED_GENERATION = prompt + "\n\nYou are at the starting point.\n\nIf"
|
||||
EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20"
|
||||
EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe"
|
||||
|
||||
def tearDown(self):
|
||||
@@ -423,28 +423,25 @@ class AwqFusedTest(unittest.TestCase):
|
||||
fuse_max_seq_len=512,
|
||||
modules_to_fuse={
|
||||
"attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
|
||||
"layernorm": ["ln1", "ln2", "norm"],
|
||||
"mlp": ["gate_proj", "up_proj", "down_proj"],
|
||||
"layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
|
||||
"use_alibi": False,
|
||||
"num_attention_heads": 56,
|
||||
"hidden_size": 4096,
|
||||
"num_attention_heads": 32,
|
||||
"num_key_value_heads": 8,
|
||||
"hidden_size": 7168,
|
||||
},
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
self.custom_mapping_model_id,
|
||||
quantization_config=quantization_config,
|
||||
trust_remote_code=True,
|
||||
device_map="balanced",
|
||||
revision=self.custom_model_revision,
|
||||
)
|
||||
|
||||
self._check_fused_modules(model)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
self.custom_mapping_model_id, revision=self.custom_model_revision, trust_remote_code=True
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.custom_mapping_model_id, revision=self.custom_model_revision)
|
||||
|
||||
prompt = "Hello"
|
||||
inputs = tokenizer(prompt, return_tensors="pt").to(torch_device)
|
||||
@@ -452,6 +449,7 @@ class AwqFusedTest(unittest.TestCase):
|
||||
outputs = model.generate(**inputs, max_new_tokens=12)
|
||||
self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)
|
||||
|
||||
@unittest.skip("Not enough GPU memory on CI runners")
|
||||
@require_torch_multi_gpu
|
||||
def test_generation_mixtral_fused(self):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user