[CI] green llama tests (#37244)

* green llama tests
* use cleanup instead
* better test comment; cleanup upgrade
* better test comment; cleanup upgrade
2025-04-03 14:15:53 +01:00
parent 782d7d945d
commit 9a1c1fe7ed
15 changed files with 62 additions and 36 deletions
--- a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
+++ b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
@@ -21,6 +21,7 @@ from parameterized import parameterized

 from transformers import AutoTokenizer, DeepseekV3Config, is_torch_available, set_seed
 from transformers.testing_utils import (
+    cleanup,
    require_read_token,
    require_torch,
    require_torch_accelerator,
@@ -605,6 +606,10 @@ class DeepseekV3IntegrationTest(unittest.TestCase):
            # 8 is for A100 / A10 and 7 for T4
            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]

+    def tearDown(self):
+        # See LlamaIntegrationTest.tearDown(). Can be removed once LlamaIntegrationTest.tearDown() is removed.
+        cleanup(torch_device, gc_collect=False)
+
    @slow
    @require_torch_accelerator
    @require_read_token
--- a/tests/models/diffllama/test_modeling_diffllama.py
+++ b/tests/models/diffllama/test_modeling_diffllama.py
@@ -25,6 +25,7 @@ from parameterized import parameterized
 from transformers import AutoTokenizer, DiffLlamaConfig, StaticCache, is_torch_available, set_seed
 from transformers.testing_utils import (
    backend_empty_cache,
+    cleanup,
    require_bitsandbytes,
    require_flash_attn,
    require_read_token,
@@ -685,6 +686,10 @@ class DiffLlamaIntegrationTest(unittest.TestCase):
            # 8 is for A100 / A10 and 7 for T4
            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]

+    def tearDown(self):
+        # See LlamaIntegrationTest.tearDown(). Can be removed once LlamaIntegrationTest.tearDown() is removed.
+        cleanup(torch_device, gc_collect=False)
+
    @slow
    @require_torch_accelerator
    @require_read_token
@@ -884,7 +889,7 @@ class Mask4DTestHard(unittest.TestCase):
        max_cache_len = 16  # note that max_cache_len is greater than the attention_mask.shape[-1]
        past_key_values = StaticCache(
            config=self.model.config,
-            batch_size=1,
+            max_batch_size=1,
            max_cache_len=max_cache_len,
            device=torch_device,
            dtype=self.model.dtype,
@@ -932,7 +937,7 @@ class Mask4DTestHard(unittest.TestCase):
        max_cache_len = 16  # note that max_cache_len is greater than the attention_mask.shape[-1]
        past_key_values = StaticCache(
            config=self.model.config,
-            batch_size=1,
+            max_batch_size=1,
            max_cache_len=max_cache_len,
            device=torch_device,
            dtype=self.model.dtype,
--- a/tests/models/gemma/test_modeling_gemma.py
+++ b/tests/models/gemma/test_modeling_gemma.py
@@ -23,6 +23,7 @@ from packaging import version
 from transformers import AutoModelForCausalLM, AutoTokenizer, GemmaConfig, is_torch_available
 from transformers.generation.configuration_utils import GenerationConfig
 from transformers.testing_utils import (
+    cleanup,
    is_flaky,
    require_bitsandbytes,
    require_flash_attn,
@@ -498,6 +499,10 @@ class GemmaIntegrationTest(unittest.TestCase):
            # 8 is for A100 / A10 and 7 for T4
            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]

+    def tearDown(self):
+        # See LlamaIntegrationTest.tearDown(). Can be removed once LlamaIntegrationTest.tearDown() is removed.
+        cleanup(torch_device, gc_collect=False)
+
    @require_read_token
    def test_model_2b_fp16(self):
        model_id = "google/gemma-2b"
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -549,6 +549,13 @@ class LlamaIntegrationTest(unittest.TestCase):
            # 8 is for A100 / A10 and 7 for T4
            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]

+    def tearDown(self):
+        # TODO (joao): automatic compilation, i.e. compilation when `cache_implementation="static"` is used, leaves
+        # some memory allocated in the cache, which means some object is not being released properly. This causes
+        # suboptimal memory usage, e.g. after certain tests a 7B model in FP16 no longer fits in a 24GB GPU.
+        # Investigate the root cause.
+        cleanup(torch_device, gc_collect=False)
+
    @slow
    @require_read_token
    def test_llama_3_1_hard(self):
@@ -748,14 +755,6 @@ class LlamaIntegrationTest(unittest.TestCase):
                "Simply put, the theory of relativity states that 1) the speed of light is the same for all "
                "observers, regardless of their location, and 2) the laws of physics are the same for all observers"
            ],
-            "meta-llama/Llama-3.2-3B": [
-                "Simply put, the theory of relativity states that 1. the speed of light is constant, and 2. "
-                "the speed of light is the fastest speed possible"
-            ],
-            "meta-llama/Llama-2-7b-hf": [
-                "Simply put, the theory of relativity states that 1) the speed of light is a constant, and 2) "
-                "the laws of physics are the same for all",
-            ],
        }

        for llama_model_ckp, EXPECTED_TEXT_COMPLETION in llama_models.items():
@@ -946,7 +945,7 @@ class Mask4DTestHard(unittest.TestCase):
        max_cache_len = 16  # note that max_cache_len is greater than the attention_mask.shape[-1]
        past_key_values = StaticCache(
            config=self.model.config,
-            batch_size=1,
+            max_batch_size=1,
            max_cache_len=max_cache_len,
            device=torch_device,
            dtype=self.model.dtype,
@@ -994,7 +993,7 @@ class Mask4DTestHard(unittest.TestCase):
        max_cache_len = 16  # note that max_cache_len is greater than the attention_mask.shape[-1]
        past_key_values = StaticCache(
            config=self.model.config,
-            batch_size=1,
+            max_batch_size=1,
            max_cache_len=max_cache_len,
            device=torch_device,
            dtype=self.model.dtype,
--- a/tests/models/phi3/test_modeling_phi3.py
+++ b/tests/models/phi3/test_modeling_phi3.py
@@ -53,7 +53,7 @@ if is_torch_available():
            self.model = model
            self.cache = StaticCache(
                config=model.config,
-                batch_size=batch_size,
+                max_batch_size=batch_size,
                max_cache_len=max_seq_len,
                device=self.model.device,
                dtype=self.model.dtype,
--- a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
+++ b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
@@ -227,10 +227,6 @@ class Phi4MultimodalModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.
    def test_flash_attn_2_inference_equivalence_right_padding(self):
        pass

-    @unittest.skip(reason="This one tries to use right padding as well")
-    def test_eager_matches_fa2_generate(self):
-        pass
-
    @unittest.skip(reason="Depending on input modalities, some params may not have gradients")
    def test_training_gradient_checkpointing(self):
        pass
--- a/tests/models/phimoe/test_modeling_phimoe.py
+++ b/tests/models/phimoe/test_modeling_phimoe.py
@@ -52,7 +52,7 @@ if is_torch_available():
            self.model = model
            self.cache = StaticCache(
                config=model.config,
-                batch_size=batch_size,
+                max_batch_size=batch_size,
                max_cache_len=max_seq_len,
                device=self.model.device,
                dtype=self.model.dtype,
--- a/tests/models/t5/test_modeling_t5.py
+++ b/tests/models/t5/test_modeling_t5.py
@@ -24,6 +24,7 @@ from transformers import T5Config, is_torch_available
 from transformers.models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
 from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4
 from transformers.testing_utils import (
+    cleanup,
    require_accelerate,
    require_sentencepiece,
    require_tokenizers,
@@ -1170,6 +1171,10 @@ class T5ModelFp16Tests(unittest.TestCase):
@require_sentencepiece
@require_tokenizers
 class T5ModelIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # See LlamaIntegrationTest.tearDown(). Can be removed once LlamaIntegrationTest.tearDown() is removed.
+        cleanup(torch_device, gc_collect=False)
+
    @cached_property
    def model(self):
        return T5ForConditionalGeneration.from_pretrained("google-t5/t5-base").to(torch_device)