Tests: remove cuda versions when the result is the same 🧹🧹 (#31955)
Remove the expected outputs keyed by CUDA compute capability major version when the result is the same for every version.
This commit is contained in:
@@ -566,24 +566,10 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
def test_model_2b_bf16(self):
|
||||
model_id = "google/gemma-2b"
|
||||
|
||||
# Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
|
||||
#
|
||||
# Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
|
||||
# considering differences in hardware processing and potential deviations in generated text.
|
||||
EXPECTED_TEXTS = {
|
||||
7: [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
|
||||
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
|
||||
],
|
||||
8: [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
|
||||
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
|
||||
],
|
||||
9: [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
|
||||
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
|
||||
],
|
||||
}
|
||||
EXPECTED_TEXTS = [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
|
||||
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
|
||||
]
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
|
||||
torch_device
|
||||
@@ -595,30 +581,16 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
||||
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
|
||||
|
||||
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
|
||||
self.assertEqual(output_text, EXPECTED_TEXTS)
|
||||
|
||||
@require_read_token
|
||||
def test_model_2b_eager(self):
|
||||
model_id = "google/gemma-2b"
|
||||
|
||||
# Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
|
||||
#
|
||||
# Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
|
||||
# considering differences in hardware processing and potential deviations in generated text.
|
||||
EXPECTED_TEXTS = {
|
||||
7: [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
|
||||
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
|
||||
],
|
||||
8: [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
|
||||
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
|
||||
],
|
||||
9: [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
|
||||
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
|
||||
],
|
||||
}
|
||||
EXPECTED_TEXTS = [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
|
||||
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
|
||||
]
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager"
|
||||
@@ -631,31 +603,17 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
||||
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
|
||||
|
||||
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
|
||||
self.assertEqual(output_text, EXPECTED_TEXTS)
|
||||
|
||||
@require_torch_sdpa
|
||||
@require_read_token
|
||||
def test_model_2b_sdpa(self):
|
||||
model_id = "google/gemma-2b"
|
||||
|
||||
# Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
|
||||
#
|
||||
# Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
|
||||
# considering differences in hardware processing and potential deviations in generated text.
|
||||
EXPECTED_TEXTS = {
|
||||
7: [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
|
||||
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
|
||||
],
|
||||
8: [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
|
||||
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
|
||||
],
|
||||
9: [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
|
||||
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
|
||||
],
|
||||
}
|
||||
EXPECTED_TEXTS = [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
|
||||
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
|
||||
]
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="sdpa"
|
||||
@@ -668,7 +626,7 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
||||
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
|
||||
|
||||
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
|
||||
self.assertEqual(output_text, EXPECTED_TEXTS)
|
||||
|
||||
@pytest.mark.flash_attn_test
|
||||
@require_flash_attn
|
||||
@@ -734,7 +692,7 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
@require_read_token
|
||||
def test_model_7b_fp16(self):
|
||||
if self.cuda_compute_capability_major_version == 7:
|
||||
self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.")
|
||||
self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU (OOM).")
|
||||
|
||||
model_id = "google/gemma-7b"
|
||||
EXPECTED_TEXTS = [
|
||||
@@ -757,7 +715,7 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
@require_read_token
|
||||
def test_model_7b_bf16(self):
|
||||
if self.cuda_compute_capability_major_version == 7:
|
||||
self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.")
|
||||
self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU (OOM).")
|
||||
|
||||
model_id = "google/gemma-7b"
|
||||
|
||||
@@ -795,7 +753,7 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
@require_read_token
|
||||
def test_model_7b_fp16_static_cache(self):
|
||||
if self.cuda_compute_capability_major_version == 7:
|
||||
self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.")
|
||||
self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU (OOM).")
|
||||
|
||||
model_id = "google/gemma-7b"
|
||||
EXPECTED_TEXTS = [
|
||||
@@ -821,16 +779,10 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
@require_read_token
|
||||
def test_model_7b_4bit(self):
|
||||
model_id = "google/gemma-7b"
|
||||
EXPECTED_TEXTS = {
|
||||
7: [
|
||||
"Hello I am doing a project for my school and I am trying to make a program that will take a number and then",
|
||||
"Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
|
||||
],
|
||||
8: [
|
||||
"Hello I am doing a project for my school and I am trying to make a program that will take a number and then",
|
||||
"Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
|
||||
],
|
||||
}
|
||||
EXPECTED_TEXTS = [
|
||||
"Hello I am doing a project for my school and I am trying to make a program that will take a number and then",
|
||||
"Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
|
||||
]
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True)
|
||||
|
||||
@@ -839,7 +791,7 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
|
||||
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
||||
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
|
||||
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
|
||||
self.assertEqual(output_text, EXPECTED_TEXTS)
|
||||
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
@@ -851,27 +803,10 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
self.skipTest(reason="This test requires torch >= 2.3 to run.")
|
||||
|
||||
NUM_TOKENS_TO_GENERATE = 40
|
||||
# Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test
|
||||
# was changed to have a cache of 53 tokens (as opposed to 4096), on Ampere GPUs.
|
||||
#
|
||||
# Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
|
||||
#
|
||||
# Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
|
||||
# considering differences in hardware processing and potential deviations in generated text.
|
||||
EXPECTED_TEXT_COMPLETION = {
|
||||
8: [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found",
|
||||
"Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the",
|
||||
],
|
||||
7: [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found",
|
||||
"Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the",
|
||||
],
|
||||
9: [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found",
|
||||
"Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the",
|
||||
],
|
||||
}
|
||||
EXPECTED_TEXT_COMPLETION = [
|
||||
"Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found",
|
||||
"Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the",
|
||||
]
|
||||
|
||||
prompts = ["Hello I am doing", "Hi today"]
|
||||
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", pad_token="</s>", padding_side="right")
|
||||
@@ -888,7 +823,7 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static"
|
||||
)
|
||||
static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
|
||||
self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_text)
|
||||
self.assertEqual(EXPECTED_TEXT_COMPLETION, static_text)
|
||||
|
||||
# Static Cache + compile
|
||||
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
|
||||
@@ -896,7 +831,7 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static"
|
||||
)
|
||||
static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
|
||||
self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_compiled_text)
|
||||
self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text)
|
||||
|
||||
def test_model_2b_bf16_dola(self):
|
||||
model_id = "google/gemma-2b"
|
||||
|
||||
Reference in New Issue
Block a user