Fixing quantization tests (#37650)

* fix

* style

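* add CUDA compute capability check (skip the fused AWQ generation tests on GPUs with capability < 8.0, where FlashAttention is not supported)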
This commit is contained in:
Mohamed Mekkouri
2025-04-22 13:59:57 +02:00
committed by GitHub
parent b3492ff9f7
commit 38c406844e
4 changed files with 18 additions and 4 deletions

View File

@@ -110,7 +110,7 @@ class AwqTest(unittest.TestCase):
input_text = "Hello my name is" input_text = "Hello my name is"
EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish" EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish"
EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a" EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish"
EXPECTED_OUTPUT_EXLLAMA = [ EXPECTED_OUTPUT_EXLLAMA = [
"Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out", "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out",
@@ -299,7 +299,7 @@ class AwqFusedTest(unittest.TestCase):
"You end up exactly where you started. Where are you?" "You end up exactly where you started. Where are you?"
) )
EXPECTED_GENERATION = prompt + "\n\nThis is a classic puzzle that has been around for" EXPECTED_GENERATION = prompt + "\n\nYou're at the center of a square."
EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20" EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20"
EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe" EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe"
@@ -355,6 +355,10 @@ class AwqFusedTest(unittest.TestCase):
# Checks if the modules_to_not_convert (here gate layer) is a Linear # Checks if the modules_to_not_convert (here gate layer) is a Linear
self.assertTrue(isinstance(model.model.layers[0].block_sparse_moe.gate, torch.nn.Linear)) self.assertTrue(isinstance(model.model.layers[0].block_sparse_moe.gate, torch.nn.Linear))
@unittest.skipIf(
torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8,
"Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
)
def test_generation_fused(self): def test_generation_fused(self):
""" """
Test generation quality for fused models - single batch case Test generation quality for fused models - single batch case
@@ -378,6 +382,10 @@ class AwqFusedTest(unittest.TestCase):
self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION) self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)
@unittest.skipIf(
torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8,
"Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
)
def test_generation_fused_batched(self): def test_generation_fused_batched(self):
""" """
Test generation quality for fused models - multi batch case Test generation quality for fused models - multi batch case
@@ -426,6 +434,10 @@ class AwqFusedTest(unittest.TestCase):
self.assertEqual(outputs[0]["generated_text"], EXPECTED_OUTPUT) self.assertEqual(outputs[0]["generated_text"], EXPECTED_OUTPUT)
@require_torch_multi_gpu @require_torch_multi_gpu
@unittest.skipIf(
torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8,
"Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
)
def test_generation_custom_model(self): def test_generation_custom_model(self):
""" """
Test generation quality for fused models using custom fused map. Test generation quality for fused models using custom fused map.

View File

@@ -904,6 +904,7 @@ class GgufModelTests(unittest.TestCase):
out = model.generate(text, max_new_tokens=10) out = model.generate(text, max_new_tokens=10)
EXPECTED_TEXT = 'Hello with the prompt, "What is the best way' EXPECTED_TEXT = 'Hello with the prompt, "What is the best way'
self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
@require_read_token @require_read_token

View File

@@ -30,7 +30,7 @@ from transformers.utils import is_accelerate_available, is_optimum_quanto_availa
if is_torch_available(): if is_torch_available():
import torch import torch
from transformers import LlamaForCausalLM, LlamaTokenizer from transformers import LlamaForCausalLM
if is_accelerate_available(): if is_accelerate_available():
from accelerate import init_empty_weights from accelerate import init_empty_weights
@@ -455,7 +455,7 @@ class QuantoKVCacheQuantizationTest(unittest.TestCase):
"Simply put, the theory of relativity states that ", "Simply put, the theory of relativity states that ",
"My favorite all time favorite condiment is ketchup.", "My favorite all time favorite condiment is ketchup.",
] ]
tokenizer = LlamaTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
"unsloth/Llama-3.2-1B-Instruct", pad_token="</s>", padding_side="left" "unsloth/Llama-3.2-1B-Instruct", pad_token="</s>", padding_side="left"
) )
model = LlamaForCausalLM.from_pretrained( model = LlamaForCausalLM.from_pretrained(

View File

@@ -54,6 +54,7 @@ class QuarkTest(unittest.TestCase):
EXPECTED_OUTPUTS.add("Today I am in Paris and I am enjoying the city of light. I am not just any ordinary Paris") EXPECTED_OUTPUTS.add("Today I am in Paris and I am enjoying the city of light. I am not just any ordinary Paris")
EXPECTED_OUTPUTS.add("Today I am in Paris and I am enjoying my day off! The sun is shining, the birds are") EXPECTED_OUTPUTS.add("Today I am in Paris and I am enjoying my day off! The sun is shining, the birds are")
EXPECTED_OUTPUTS.add("Today I am in Paris and I'm here to tell you about it. It's a beautiful day,") EXPECTED_OUTPUTS.add("Today I am in Paris and I'm here to tell you about it. It's a beautiful day,")
EXPECTED_OUTPUTS.add("Today I am in Paris and I am not in Paris at all! I am not in Paris, but")
EXPECTED_RELATIVE_DIFFERENCE = 1.66 EXPECTED_RELATIVE_DIFFERENCE = 1.66
device_map = None device_map = None