From 38c406844e3250168ed852e3b7141793656a0411 Mon Sep 17 00:00:00 2001 From: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> Date: Tue, 22 Apr 2025 13:59:57 +0200 Subject: [PATCH] Fixing quantization tests (#37650) * fix * style * add capability check --- tests/quantization/autoawq/test_awq.py | 16 ++++++++++++++-- tests/quantization/ggml/test_ggml.py | 1 + .../quanto_integration/test_quanto.py | 4 ++-- .../quantization/quark_integration/test_quark.py | 1 + 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index d234dd408a..055f736e12 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -110,7 +110,7 @@ class AwqTest(unittest.TestCase): input_text = "Hello my name is" EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish" - EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a" + EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish" EXPECTED_OUTPUT_EXLLAMA = [ "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out", @@ -299,7 +299,7 @@ class AwqFusedTest(unittest.TestCase): "You end up exactly where you started. Where are you?" ) - EXPECTED_GENERATION = prompt + "\n\nThis is a classic puzzle that has been around for" + EXPECTED_GENERATION = prompt + "\n\nYou're at the center of a square." EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20" EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe" @@ -355,6 +355,10 @@ class AwqFusedTest(unittest.TestCase): # Checks if the modules_to_not_convert (here gate layer) is a Linear self.assertTrue(isinstance(model.model.layers[0].block_sparse_moe.gate, torch.nn.Linear)) + @unittest.skipIf( + torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8, + "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0", + ) def test_generation_fused(self): """ Test generation quality for fused models - single batch case @@ -378,6 +382,10 @@ class AwqFusedTest(unittest.TestCase): self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION) + @unittest.skipIf( + torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8, + "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0", + ) def test_generation_fused_batched(self): """ Test generation quality for fused models - multi batch case @@ -426,6 +434,10 @@ class AwqFusedTest(unittest.TestCase): self.assertEqual(outputs[0]["generated_text"], EXPECTED_OUTPUT) @require_torch_multi_gpu + @unittest.skipIf( + torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8, + "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0", + ) def test_generation_custom_model(self): """ Test generation quality for fused models using custom fused map. diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index 52c700b16f..b80de8d45d 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -904,6 +904,7 @@ class GgufModelTests(unittest.TestCase): out = model.generate(text, max_new_tokens=10) EXPECTED_TEXT = 'Hello with the prompt, "What is the best way' + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) @require_read_token diff --git a/tests/quantization/quanto_integration/test_quanto.py b/tests/quantization/quanto_integration/test_quanto.py index 50685137cb..766faafbbf 100644 --- a/tests/quantization/quanto_integration/test_quanto.py +++ b/tests/quantization/quanto_integration/test_quanto.py @@ -30,7 +30,7 @@ from transformers.utils import is_accelerate_available, is_optimum_quanto_availa if is_torch_available(): import torch - from transformers import LlamaForCausalLM, LlamaTokenizer + from transformers import LlamaForCausalLM if is_accelerate_available(): from accelerate import init_empty_weights @@ -455,7 +455,7 @@ class QuantoKVCacheQuantizationTest(unittest.TestCase): "Simply put, the theory of relativity states that ", "My favorite all time favorite condiment is ketchup.", ] - tokenizer = LlamaTokenizer.from_pretrained( + tokenizer = AutoTokenizer.from_pretrained( "unsloth/Llama-3.2-1B-Instruct", pad_token="", padding_side="left" ) model = LlamaForCausalLM.from_pretrained( diff --git a/tests/quantization/quark_integration/test_quark.py b/tests/quantization/quark_integration/test_quark.py index 4e2c964d56..22d0eb5293 100644 --- a/tests/quantization/quark_integration/test_quark.py +++ b/tests/quantization/quark_integration/test_quark.py @@ -54,6 +54,7 @@ class QuarkTest(unittest.TestCase): EXPECTED_OUTPUTS.add("Today I am in Paris and I am enjoying the city of light. I am not just any ordinary Paris") EXPECTED_OUTPUTS.add("Today I am in Paris and I am enjoying my day off! The sun is shining, the birds are") EXPECTED_OUTPUTS.add("Today I am in Paris and I'm here to tell you about it. It's a beautiful day,") + EXPECTED_OUTPUTS.add("Today I am in Paris and I am not in Paris at all! I am not in Paris, but") EXPECTED_RELATIVE_DIFFERENCE = 1.66 device_map = None