Tests: remove cuda versions when the result is the same 🧹🧹 (#31955)

remove cuda versions when the result is the same
This commit is contained in:
Joao Gante
2024-07-16 16:49:54 +01:00
committed by GitHub
parent 693cb828ff
commit 999981daf4
3 changed files with 52 additions and 155 deletions

View File

@@ -566,24 +566,10 @@ class GemmaIntegrationTest(unittest.TestCase):
def test_model_2b_bf16(self):
model_id = "google/gemma-2b"
# Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
#
# Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
# considering differences in hardware processing and potential deviations in generated text.
EXPECTED_TEXTS = {
7: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
],
8: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
],
9: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
],
}
EXPECTED_TEXTS = [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
torch_device
@@ -595,30 +581,16 @@ class GemmaIntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
self.assertEqual(output_text, EXPECTED_TEXTS)
@require_read_token
def test_model_2b_eager(self):
model_id = "google/gemma-2b"
# Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
#
# Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
# considering differences in hardware processing and potential deviations in generated text.
EXPECTED_TEXTS = {
7: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
],
8: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
],
9: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
],
}
EXPECTED_TEXTS = [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
]
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager"
@@ -631,31 +603,17 @@ class GemmaIntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
self.assertEqual(output_text, EXPECTED_TEXTS)
@require_torch_sdpa
@require_read_token
def test_model_2b_sdpa(self):
model_id = "google/gemma-2b"
# Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
#
# Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
# considering differences in hardware processing and potential deviations in generated text.
EXPECTED_TEXTS = {
7: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
],
8: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
],
9: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
],
}
EXPECTED_TEXTS = [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
]
model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="sdpa"
@@ -668,7 +626,7 @@ class GemmaIntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
self.assertEqual(output_text, EXPECTED_TEXTS)
@pytest.mark.flash_attn_test
@require_flash_attn
@@ -734,7 +692,7 @@ class GemmaIntegrationTest(unittest.TestCase):
@require_read_token
def test_model_7b_fp16(self):
if self.cuda_compute_capability_major_version == 7:
self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.")
self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU (OOM).")
model_id = "google/gemma-7b"
EXPECTED_TEXTS = [
@@ -757,7 +715,7 @@ class GemmaIntegrationTest(unittest.TestCase):
@require_read_token
def test_model_7b_bf16(self):
if self.cuda_compute_capability_major_version == 7:
self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.")
self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU (OOM).")
model_id = "google/gemma-7b"
@@ -795,7 +753,7 @@ class GemmaIntegrationTest(unittest.TestCase):
@require_read_token
def test_model_7b_fp16_static_cache(self):
if self.cuda_compute_capability_major_version == 7:
self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.")
self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU (OOM).")
model_id = "google/gemma-7b"
EXPECTED_TEXTS = [
@@ -821,16 +779,10 @@ class GemmaIntegrationTest(unittest.TestCase):
@require_read_token
def test_model_7b_4bit(self):
model_id = "google/gemma-7b"
EXPECTED_TEXTS = {
7: [
"Hello I am doing a project for my school and I am trying to make a program that will take a number and then",
"Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
],
8: [
"Hello I am doing a project for my school and I am trying to make a program that will take a number and then",
"Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
],
}
EXPECTED_TEXTS = [
"Hello I am doing a project for my school and I am trying to make a program that will take a number and then",
"Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
]
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True)
@@ -839,7 +791,7 @@ class GemmaIntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
self.assertEqual(output_text, EXPECTED_TEXTS)
@slow
@require_torch_gpu
@@ -851,27 +803,10 @@ class GemmaIntegrationTest(unittest.TestCase):
self.skipTest(reason="This test requires torch >= 2.3 to run.")
NUM_TOKENS_TO_GENERATE = 40
# Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test
# was changed to have a cache of 53 tokens (as opposed to 4096), on Ampere GPUs.
#
# Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
#
# Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
# considering differences in hardware processing and potential deviations in generated text.
EXPECTED_TEXT_COMPLETION = {
8: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found",
"Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the",
],
7: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found",
"Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the",
],
9: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found",
"Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the",
],
}
EXPECTED_TEXT_COMPLETION = [
"Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found",
"Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the",
]
prompts = ["Hello I am doing", "Hi today"]
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", pad_token="</s>", padding_side="right")
@@ -888,7 +823,7 @@ class GemmaIntegrationTest(unittest.TestCase):
**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static"
)
static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_text)
self.assertEqual(EXPECTED_TEXT_COMPLETION, static_text)
# Static Cache + compile
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
@@ -896,7 +831,7 @@ class GemmaIntegrationTest(unittest.TestCase):
**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static"
)
static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_compiled_text)
self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text)
def test_model_2b_bf16_dola(self):
model_id = "google/gemma-2b"