From 9981214d32c9d0777e1d67cd68d4dd4009712e75 Mon Sep 17 00:00:00 2001
From: Joao Gante <joaofranciscocardosogante@gmail.com>
Date: Tue, 6 May 2025 11:15:25 +0100
Subject: [PATCH] [tests] Smaller model in slow cache tests (#37922)

---
 tests/utils/test_cache_utils.py | 98 +++++++++++++++++++--------------
 1 file changed, 58 insertions(+), 40 deletions(-)

diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py
index 980f57aa34..691c5aa535 100644
--- a/tests/utils/test_cache_utils.py
+++ b/tests/utils/test_cache_utils.py
@@ -24,6 +24,7 @@ from transformers.testing_utils import (
     cleanup,
     get_gpu_count,
     is_torch_available,
+    require_read_token,
     require_torch,
     require_torch_accelerator,
     require_torch_gpu,
@@ -301,37 +302,52 @@ class CacheIntegrationTest(unittest.TestCase):
 class CacheHardIntegrationTest(unittest.TestCase):
     """Hard cache integration tests that require loading different models"""
 
-    def tearDown(self):
-        # Some tests use large models, which might result in suboptimal torch re-allocation if we run multiple tests
-        # in a row
+    def setUp(self):
+        # Clears memory before each test. Some tests use large models, which might result in suboptimal torch
+        # re-allocation if we run multiple tests in a row without clearing memory.
+        cleanup(torch_device, gc_collect=True)
+
+    @classmethod
+    def tearDownClass(cls):
+        # Clears memory after the last test. See `setUp` for more details.
         cleanup(torch_device, gc_collect=True)
 
     @slow
     def test_dynamic_cache_hard(self):
         """Hard test for base cache implementation -- minor numerical fluctuations will cause this test to fail"""
-        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left")
-        model = AutoModelForCausalLM.from_pretrained(
-            "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16
-        )
+        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B", padding_side="left")
+        model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B", device_map="auto", torch_dtype=torch.bfloat16)
         inputs = tokenizer(["Here's everything I know about cats. Cats"], return_tensors="pt").to(model.device)
 
         set_seed(0)
-        gen_out = model.generate(**inputs, do_sample=True, max_new_tokens=256)
-
-        decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
-        expected_text = (
-            "Here's everything I know about cats. Cats are mysterious creatures. They can't talk, and they don't like "
-            "to be held. They don't play fetch, and they don't like to be hugged. But they do like to be petted.\n"
-            "Cats are also very independent. They don't like to be told what to do, and they don't like to be told "
-            "what to eat. They are also very territorial. They don't like to share their food or their toys.\nCats "
-            "are also very curious. They like to explore, and they like to play. They are also very fast. They can "
-            "run very fast, and they can jump very high.\nCats are also very smart. They can learn tricks, and they "
-            "can solve problems. They are also very playful. They like to play with toys, and they like to play with "
-            "other cats.\nCats are also very affectionate. They like to be petted, and they like to be held. They "
-            "also like to be scratched.\nCats are also very clean. They like to groom themselves, and they like to "
-            "clean their litter box.\nCats are also very independent. They don't"
+        gen_out = model.generate(
+            **inputs, do_sample=True, max_new_tokens=256, return_dict_in_generate=True, output_scores=True
         )
-        self.assertEqual(decoded[0], expected_text)
+        decoded = tokenizer.batch_decode(gen_out.sequences, skip_special_tokens=True)
+        # sum of the scores for the generated tokens
+        input_length = inputs.input_ids.shape[1]
+        score_sum = sum(
+            [score[0][gen_out.sequences[0][input_length + idx]] for idx, score in enumerate(gen_out.scores)]
+        )
+
+        EXPECTED_GENERATION = (
+            "Here's everything I know about cats. Cats are mammals, they have four legs, they have a tail, they have "
+            "a face with a nose, eyes, and mouth. They have fur, they have claws, and they have a body that is "
+            "covered in fur. They are carnivores, so they eat meat. They are also very clean animals, they groom "
+            "themselves. They have a lot of different breeds. Some are small, some are large. Some are friendly, "
+            "some are not. They have a lot of different personalities. They can be very independent, or they can be "
+            "very affectionate. They can be very playful, or they can be very lazy. They can be very intelligent, or "
+            "they can be very silly. They have a lot of different behaviors. They can be very curious, or they can "
+            "be very cautious. They can be very vocal, or they can be very quiet. They can be very social, or they "
+            "can be very solitary. They can be very active, or they can be very inactive. They can be very "
+            "affectionate, or they can be very aloof. They can be very playful, or they can be very lazy. They can "
+            "be very intelligent, or they can be very silly. They have a lot of different behaviors. They can be "
+            "very curious, or they can"
+        )
+        EXPECTED_SCORE_SUM = 11017.4971
+        self.assertEqual(decoded[0], EXPECTED_GENERATION)
+        self.assertAlmostEqual(score_sum, EXPECTED_SCORE_SUM, places=2)
+        self.assertIsInstance(gen_out.past_key_values, DynamicCache)  # sanity check
 
     @parameterized.expand([("eager"), ("sdpa")])
     @require_torch_gpu
@@ -339,41 +355,42 @@ class CacheHardIntegrationTest(unittest.TestCase):
     def test_static_cache_greedy_decoding_pad_left(self, attn_implementation):
         """Tests that different cache implementations work well with eager and SDPA inference"""
         EXPECTED_GENERATION = [
-            "The best color is the one that complements the skin tone of the",
-            "We should not undermind the issues at hand.\nWe should not undermind the issues",
+            "The best color is the one that is most suitable for the purpose.",
+            "We should not undermind the issues at hand, but instead, we should focus on the things",
         ]
 
-        tokenizer = AutoTokenizer.from_pretrained(
-            "NousResearch/Llama-2-7b-chat-hf", padding_side="left", pad_token="<s>"
-        )
+        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B", padding_side="left")
         model = AutoModelForCausalLM.from_pretrained(
-            "NousResearch/Llama-2-7b-chat-hf",
+            "Qwen/Qwen3-4B",
             torch_dtype=torch.bfloat16,
             attn_implementation=attn_implementation,
-        ).to(torch_device)
+            device_map="auto",
+        )
         inputs = tokenizer(
             ["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt"
         ).to(model.device)
+        generation_kwargs = {"do_sample": False, "max_new_tokens": 10, "return_dict_in_generate": True}
 
         set_seed(0)
-        gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10)
-        decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
+        gen_out = model.generate(**inputs, **generation_kwargs)
+        decoded = tokenizer.batch_decode(gen_out.sequences, skip_special_tokens=True)
         with self.subTest(f"{attn_implementation}, dynamic"):
             self.assertListEqual(decoded, EXPECTED_GENERATION)
+            self.assertIsInstance(gen_out.past_key_values, DynamicCache)  # sanity check
 
         set_seed(0)
-        gen_out = model.generate(
-            **inputs, do_sample=False, max_new_tokens=10, cache_implementation="static", disable_compile=True
-        )
-        decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
+        gen_out = model.generate(**inputs, **generation_kwargs, cache_implementation="static", disable_compile=True)
+        decoded = tokenizer.batch_decode(gen_out.sequences, skip_special_tokens=True)
         with self.subTest(f"{attn_implementation}, static, eager"):
             self.assertListEqual(decoded, EXPECTED_GENERATION)
+            self.assertIsInstance(gen_out.past_key_values, StaticCache)  # sanity check
 
         set_seed(0)
-        gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10, cache_implementation="static")
-        decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
+        gen_out = model.generate(**inputs, **generation_kwargs, cache_implementation="static")
+        decoded = tokenizer.batch_decode(gen_out.sequences, skip_special_tokens=True)
         with self.subTest(f"{attn_implementation}, static, compiled"):
             self.assertListEqual(decoded, EXPECTED_GENERATION)
+            self.assertIsInstance(gen_out.past_key_values, StaticCache)  # sanity check
 
     @require_torch_accelerator
     @slow
@@ -446,9 +463,9 @@ class CacheHardIntegrationTest(unittest.TestCase):
             responses.append(response)
 
         EXPECTED_DECODED_TEXT = [
-            "You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTraveling is a wonderful "
-            "way to explore new places, cultures, and experiences. Whether you are a seasoned traveler or a "
-            "first-time adventurer, there is always something",
+            "You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTraveling is an "
+            "enriching experience that broadens our horizons and allows us to explore the world beyond our comfort "
+            "zones. Whether it's a short weekend getaway",
             "You are a helpful assistant. What is the capital of France?\n\n\n## Response:Paris is the capital "
             "of France.\n\n\n\n\n\n\n<|endoftext|>",
         ]
@@ -506,6 +523,7 @@ class CacheHardIntegrationTest(unittest.TestCase):
 
     @require_torch_multi_gpu
     @slow
+    @require_read_token
     def test_static_cache_multi_gpu(self):
         """Regression test for #35164: static cache with multi-gpu"""