Tests: remove cuda versions when the result is the same 🧹🧹 (#31955)

Remove the per-device expected outputs (keyed by CUDA compute capability major version) when the result is the same on every device.
2024-07-16 16:49:54 +01:00
parent 693cb828ff
commit 999981daf4
3 changed files with 52 additions and 155 deletions
--- a/tests/models/gemma/test_modeling_gemma.py
+++ b/tests/models/gemma/test_modeling_gemma.py
@@ -566,24 +566,10 @@ class GemmaIntegrationTest(unittest.TestCase):
    def test_model_2b_bf16(self):
        model_id = "google/gemma-2b"

-        # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
-        #
-        # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
-        # considering differences in hardware processing and potential deviations in generated text.
-        EXPECTED_TEXTS = {
-            7: [
-                "Hello I am doing a project on the 1990s and I need to know what the most popular music",
-                "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
-            ],
-            8: [
-                "Hello I am doing a project on the 1990s and I need to know what the most popular music",
-                "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
-            ],
-            9: [
-                "Hello I am doing a project on the 1990s and I need to know what the most popular music",
-                "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
-            ],
-        }
+        EXPECTED_TEXTS = [
+            "Hello I am doing a project on the 1990s and I need to know what the most popular music",
+            "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
+        ]

        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
            torch_device
@@ -595,30 +581,16 @@ class GemmaIntegrationTest(unittest.TestCase):
        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        output_text = tokenizer.batch_decode(output, skip_special_tokens=True)

-        self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
+        self.assertEqual(output_text, EXPECTED_TEXTS)

    @require_read_token
    def test_model_2b_eager(self):
        model_id = "google/gemma-2b"

-        # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
-        #
-        # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
-        # considering differences in hardware processing and potential deviations in generated text.
-        EXPECTED_TEXTS = {
-            7: [
-                "Hello I am doing a project on the 1990s and I need to know what the most popular music",
-                "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
-            ],
-            8: [
-                "Hello I am doing a project on the 1990s and I need to know what the most popular music",
-                "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
-            ],
-            9: [
-                "Hello I am doing a project on the 1990s and I need to know what the most popular music",
-                "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
-            ],
-        }
+        EXPECTED_TEXTS = [
+            "Hello I am doing a project on the 1990s and I need to know what the most popular music",
+            "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
+        ]

        model = AutoModelForCausalLM.from_pretrained(
            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager"
@@ -631,31 +603,17 @@ class GemmaIntegrationTest(unittest.TestCase):
        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        output_text = tokenizer.batch_decode(output, skip_special_tokens=True)

-        self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
+        self.assertEqual(output_text, EXPECTED_TEXTS)

    @require_torch_sdpa
    @require_read_token
    def test_model_2b_sdpa(self):
        model_id = "google/gemma-2b"

-        # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
-        #
-        # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
-        # considering differences in hardware processing and potential deviations in generated text.
-        EXPECTED_TEXTS = {
-            7: [
-                "Hello I am doing a project on the 1990s and I need to know what the most popular music",
-                "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
-            ],
-            8: [
-                "Hello I am doing a project on the 1990s and I need to know what the most popular music",
-                "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
-            ],
-            9: [
-                "Hello I am doing a project on the 1990s and I need to know what the most popular music",
-                "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
-            ],
-        }
+        EXPECTED_TEXTS = [
+            "Hello I am doing a project on the 1990s and I need to know what the most popular music",
+            "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
+        ]

        model = AutoModelForCausalLM.from_pretrained(
            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="sdpa"
@@ -668,7 +626,7 @@ class GemmaIntegrationTest(unittest.TestCase):
        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        output_text = tokenizer.batch_decode(output, skip_special_tokens=True)

-        self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
+        self.assertEqual(output_text, EXPECTED_TEXTS)

    @pytest.mark.flash_attn_test
    @require_flash_attn
@@ -734,7 +692,7 @@ class GemmaIntegrationTest(unittest.TestCase):
    @require_read_token
    def test_model_7b_fp16(self):
        if self.cuda_compute_capability_major_version == 7:
-            self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.")
+            self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU (OOM).")

        model_id = "google/gemma-7b"
        EXPECTED_TEXTS = [
@@ -757,7 +715,7 @@ class GemmaIntegrationTest(unittest.TestCase):
    @require_read_token
    def test_model_7b_bf16(self):
        if self.cuda_compute_capability_major_version == 7:
-            self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.")
+            self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU (OOM).")

        model_id = "google/gemma-7b"

@@ -795,7 +753,7 @@ class GemmaIntegrationTest(unittest.TestCase):
    @require_read_token
    def test_model_7b_fp16_static_cache(self):
        if self.cuda_compute_capability_major_version == 7:
-            self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.")
+            self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU (OOM).")

        model_id = "google/gemma-7b"
        EXPECTED_TEXTS = [
@@ -821,16 +779,10 @@ class GemmaIntegrationTest(unittest.TestCase):
    @require_read_token
    def test_model_7b_4bit(self):
        model_id = "google/gemma-7b"
-        EXPECTED_TEXTS = {
-            7: [
-                "Hello I am doing a project for my school and I am trying to make a program that will take a number and then",
-                "Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
-            ],
-            8: [
-                "Hello I am doing a project for my school and I am trying to make a program that will take a number and then",
-                "Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
-            ],
-        }
+        EXPECTED_TEXTS = [
+            "Hello I am doing a project for my school and I am trying to make a program that will take a number and then",
+            "Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
+        ]

        model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True)

@@ -839,7 +791,7 @@ class GemmaIntegrationTest(unittest.TestCase):

        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
-        self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
+        self.assertEqual(output_text, EXPECTED_TEXTS)

    @slow
    @require_torch_gpu
@@ -851,27 +803,10 @@ class GemmaIntegrationTest(unittest.TestCase):
            self.skipTest(reason="This test requires torch >= 2.3 to run.")

        NUM_TOKENS_TO_GENERATE = 40
-        # Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test
-        # was changed to have a cache of 53 tokens (as opposed to 4096), on Ampere GPUs.
-        #
-        # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
-        #
-        # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
-        # considering differences in hardware processing and potential deviations in generated text.
-        EXPECTED_TEXT_COMPLETION = {
-            8: [
-                "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found",
-                "Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the",
-            ],
-            7: [
-                "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found",
-                "Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the",
-            ],
-            9: [
-                "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found",
-                "Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the",
-            ],
-        }
+        EXPECTED_TEXT_COMPLETION = [
+            "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found",
+            "Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the",
+        ]

        prompts = ["Hello I am doing", "Hi today"]
        tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", pad_token="</s>", padding_side="right")
@@ -888,7 +823,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static"
        )
        static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-        self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_text)
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, static_text)

        # Static Cache + compile
        model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
@@ -896,7 +831,7 @@ class GemmaIntegrationTest(unittest.TestCase):
            **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static"
        )
        static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-        self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_compiled_text)
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text)

    def test_model_2b_bf16_dola(self):
        model_id = "google/gemma-2b"