[tests] reorganize cache tests and clean memory between tests (#37684)

2025-04-29 12:21:14 +01:00
parent 3a1acc36ed
commit 755b0fa2fe
1 changed files with 466 additions and 482 deletions
--- a/tests/utils/test_cache_utils.py
+++ b/tests/utils/test_cache_utils.py
@@ -20,6 +20,7 @@ from parameterized import parameterized
 from transformers import set_seed
 from transformers.testing_utils import (
    CaptureStderr,
    cleanup,
    get_gpu_count,
    is_torch_available,
    require_gptq,
@@ -53,6 +54,8 @@ if is_torch_available():
@require_torch
 class CacheTest(unittest.TestCase):
    """Cache tests that don't require loading models"""
    def test_dynamic_cache_retrocompatibility(self):
        """Tests that we can convert back and forth between the legacy cache format and DynamicCache"""
        legacy_cache = ()
@@ -173,6 +176,469 @@ class CacheTest(unittest.TestCase):
        self.assertTrue(cached_keys.shape == (1, 1, 10, 128))
        self.assertTrue(cached_values.shape == (1, 1, 10, 128))
@require_torch_accelerator
 class CacheIntegrationTest(unittest.TestCase):
    """Cache tests that require loading models"""
    def tearDown(self):
        """Runs `cleanup` on `torch_device` after each test of this class."""
        # Some tests use large models, which might result in suboptimal torch re-allocation if we run multiple tests
        # in a row
        cleanup(torch_device, gc_collect=True)
    @slow
    def test_dynamic_cache_hard(self):
        """Checks that sampling with an explicit `DynamicCache` matches the run without one, and a reference text"""
        checkpoint = "meta-llama/Llama-2-7b-hf"
        reference_text = (
            "Here's everything I know about cats. Cats are mysterious creatures. They can't talk, and they don't like "
            "to be held. They don't play fetch, and they don't like to be hugged. But they do like to be petted.\n"
            "Cats are also very independent. They don't like to be told what to do, and they don't like to be told "
            "what to eat. They are also very territorial. They don't like to share their food or their toys.\nCats "
            "are also very curious. They like to explore, and they like to play. They are also very fast. They can "
            "run very fast, and they can jump very high.\nCats are also very smart. They can learn tricks, and they "
            "can solve problems. They are also very playful. They like to play with toys, and they like to play with "
            "other cats.\nCats are also very affectionate. They like to be petted, and they like to be held. They "
            "also like to be scratched.\nCats are also very clean. They like to groom themselves, and they like to "
            "clean their litter box.\nCats are also very independent. They don't"
        )
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16)
        inputs = tokenizer(["Here's everything I know about cats. Cats"], return_tensors="pt").to(model.device)
        sampling_kwargs = {"do_sample": True, "max_new_tokens": 256}
        # A run without an explicit cache and a run with a fresh DynamicCache must yield the same tokens. The seed is
        # reset before each sampled run so the two runs are comparable.
        set_seed(0)
        tokens_without_cache_arg = model.generate(**inputs, **sampling_kwargs)
        set_seed(0)
        tokens_with_dynamic_cache = model.generate(**inputs, **sampling_kwargs, past_key_values=DynamicCache())
        self.assertListEqual(tokens_without_cache_arg.tolist(), tokens_with_dynamic_cache.tolist())
        decoded_batch = tokenizer.batch_decode(tokens_with_dynamic_cache, skip_special_tokens=True)
        self.assertEqual(decoded_batch[0], reference_text)
    @slow
    def test_dynamic_cache_batched(self):
        """Checks greedy generation on a left-padded batch of two prompts when a `DynamicCache` is passed in"""
        checkpoint = "meta-llama/Llama-2-7b-hf"
        prompts = ["A sequence: 1, 2, 3, 4, 5", "A sequence: A, B, C"]
        expected_continuations = ["A sequence: 1, 2, 3, 4, 5, 6, 7, 8,", "A sequence: A, B, C, D, E, F, G, H"]
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")
        # The EOS token doubles as the pad token for the padded batch below
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16)
        inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)
        generated = model.generate(**inputs, do_sample=False, max_new_tokens=10, past_key_values=DynamicCache())
        decoded_batch = tokenizer.batch_decode(generated, skip_special_tokens=True)
        self.assertListEqual(decoded_batch, expected_continuations)
    @slow
    def test_dynamic_cache_beam_search(self):
        """
        Checks beam-search generation (2 beams, 2 returned sequences) against reference texts. No cache object is
        passed to `generate` here, so whichever cache `generate` picks by default is the one exercised.
        """
        checkpoint = "meta-llama/Llama-2-7b-hf"
        expected_beams = [
            "The best color is the one that makes you feel good.\nThe best color is the one that makes you feel good",
            "The best color is the one that suits you.\nThe best color is the one that suits you. The",
        ]
        beam_search_kwargs = {
            "do_sample": False,
            "max_new_tokens": 20,
            "num_beams": 2,
            "num_return_sequences": 2,
        }
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16)
        inputs = tokenizer(["The best color is"], return_tensors="pt").to(model.device)
        generated = model.generate(**inputs, **beam_search_kwargs)
        decoded_beams = tokenizer.batch_decode(generated, skip_special_tokens=True)
        self.assertListEqual(decoded_beams, expected_beams)
    @slow
    def test_hybrid_cache_n_sequences(self):
        """
        Checks that beam search returning two sequences on `google/gemma-2-9b` matches the reference texts.

        NOTE(review): no cache argument is passed to `generate`; presumably the hybrid cache named in the test title
        is what this checkpoint uses by default -- confirm.
        """
        checkpoint = "google/gemma-2-9b"
        expected_beams = [
            "Hello I am doing a project for my school and I am trying to make a program that will allow me to input a",
            "Hello I am doing a project for my school and I am trying to make a program that will allow me to use a",
        ]
        beam_search_kwargs = {
            "do_sample": False,
            "max_new_tokens": 20,
            "num_return_sequences": 2,
            "num_beams": 2,
        }
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            attn_implementation="eager",
        )
        inputs = tokenizer(["Hello I am doing"], return_tensors="pt").to(model.device)
        generated = model.generate(**inputs, **beam_search_kwargs)
        decoded_beams = tokenizer.batch_decode(generated, skip_special_tokens=True)
        self.assertListEqual(decoded_beams, expected_beams)
    @require_non_xpu
    @require_gptq
    @slow
    def test_sink_cache_hard(self):
        """Checks that a 3000-token greedy generation through a `SinkCache` ends with the expected sentence"""
        checkpoint = "TheBloke/LLaMa-7B-GPTQ"
        expected_ending = "to perform a variety of tasks. The Transformer is a neural network"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")
        prompts = ["Vaswani et al. (2017) introduced the Transformers"]
        inputs = tokenizer(prompts, return_tensors="pt").to(model.device)
        # Set up the SinkCache. Using a small window length to contain computational complexity. If this example is run
        # without a SinkCache, the last few tokens are gibberish (ends in "of the of the of a of a of")
        sink_cache = SinkCache(window_length=508, num_sink_tokens=4)
        generated = model.generate(**inputs, do_sample=False, max_new_tokens=3000, past_key_values=sink_cache)
        final_text = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        self.assertTrue(final_text.endswith(expected_ending))
    @slow
    def test_sink_cache_iterative_prompts(self):
        """Tests that SinkCache supports more than one new token at once, when shifting the cache"""
        checkpoint = "HuggingFaceH4/zephyr-7b-beta"
        prompt = (
            "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences "
            "and must-see attractions."
        )
        expected_ending = (
            "<|assistant|>\nAs the sun began to set over the Pacific Ocean, I found myself standing on the shores of "
            "Waikiki Beach, my heart filled with awe and wonder. I had just returned from a two-week journey to the "
            "beautiful island of Hawaii, and it had been an unforgettable experience filled with cultural experiences "
            "and must-see attractions that left me breathless.\n\nOne of the most memorable experiences of my trip "
            "was visiting the historic district of Honolulu. Here,"
        )
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16)
        # One cache object is shared by all three generation rounds
        sink_cache = SinkCache(window_length=256, num_sink_tokens=4)
        # Start from an empty token history; each round appends the templated prompt, and the output of `generate`
        # becomes the history for the next round
        input_ids = torch.tensor([], device=model.device, dtype=torch.int)
        for _round in range(3):
            # Tokenize the prompt with the correct chat template
            chat = [{"role": "user", "content": prompt}]
            templated_prompt = tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True)
            input_ids = torch.cat((input_ids, templated_prompt.to(model.device)), dim=1)
            input_ids = model.generate(
                input_ids, do_sample=False, max_new_tokens=100, past_key_values=sink_cache, use_cache=True
            )
        # We went well beyond the cache length
        self.assertTrue(input_ids.shape[1] > sink_cache.get_max_cache_shape() * 1.5)
        # And the text still ends with the expected, coherent English passage
        full_text = tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0]
        self.assertTrue(full_text.endswith(expected_ending))
    @parameterized.expand(
        [
            ("eager", "static"),
            ("sdpa", "static"),
        ]
    )
    @require_torch_gpu
    @slow
    def test_static_cache_greedy_decoding_pad_left(self, attn_implementation, cache_implementation):
        """
        Checks that greedy decoding of a left-padded batch yields the same text in three setups, applied cumulatively:
        default generation config, `cache_implementation` set on the generation config, and `model.forward` wrapped
        in `torch.compile`.
        """
        checkpoint = "NousResearch/Llama-2-7b-chat-hf"
        prompts = ["The best color is", "We should not undermind the issues at hand"]
        EXPECTED_GENERATION = [
            "The best color is the one that complements the skin tone of the",
            "We should not undermind the issues at hand.\nWe should not undermind the issues",
        ]
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left", pad_token="<s>")
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            torch_dtype=torch.bfloat16,
            attn_implementation=attn_implementation,
        ).to(torch_device)
        inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)

        def greedy_decode():
            # Reads `model` at call time, so it picks up the config change and the compiled forward set below
            generated = model.generate(**inputs, do_sample=False, max_new_tokens=10)
            return tokenizer.batch_decode(generated, skip_special_tokens=True)

        set_seed(0)
        decoded = greedy_decode()
        with self.subTest(f"{attn_implementation}, dynamic"):
            self.assertListEqual(decoded, EXPECTED_GENERATION)
        set_seed(0)
        model.generation_config.cache_implementation = cache_implementation
        decoded = greedy_decode()
        with self.subTest(f"{attn_implementation}, static, eager"):
            self.assertListEqual(decoded, EXPECTED_GENERATION)
        set_seed(0)
        model.forward = torch.compile(model.forward)
        decoded = greedy_decode()
        with self.subTest(f"{attn_implementation}, static, compiled"):
            self.assertListEqual(decoded, EXPECTED_GENERATION)
    @slow
    def test_dynamic_cache_extra_left_padding(self):
        """Tests that adding extra left-padding does not affect the generation with the dynamic cache"""
        checkpoint = "NousResearch/Llama-2-7b-chat-hf"
        prompts = ["The best color is", "We should not undermind the issues at hand"]
        EXPECTED_GENERATION = [
            "The best color is the one that complements the skin tone of the",
            "We should not undermind the issues at hand.\nWe should not undermind the issues",
        ]
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left", pad_token="<s>")
        model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(torch_device)

        def greedy_decode(model_inputs):
            generated = model.generate(**model_inputs, do_sample=False, max_new_tokens=10)
            return tokenizer.batch_decode(generated, skip_special_tokens=True)

        # Baseline: padded only up to the longest prompt in the batch
        inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)
        self.assertListEqual(greedy_decode(inputs), EXPECTED_GENERATION)
        # Now with extra left-padding
        inputs_expanded = tokenizer(
            prompts,
            padding=True,
            return_tensors="pt",
            pad_to_multiple_of=32,
        ).to(model.device)
        # Sanity check: the second batch really is wider than the first
        self.assertTrue(inputs.input_ids.shape[1] < inputs_expanded.input_ids.shape[1])
        self.assertListEqual(greedy_decode(inputs_expanded), EXPECTED_GENERATION)
    @slow
    def test_static_cache_extra_left_padding(self):
        """Tests that adding extra left-padding does not affect the generation with the static cache"""
        checkpoint = "NousResearch/Llama-2-7b-chat-hf"
        prompts = ["The best color is", "We should not undermind the issues at hand"]
        EXPECTED_GENERATION = [
            "The best color is the one that complements the skin tone of the",
            "We should not undermind the issues at hand.\nWe should not undermind the issues",
        ]
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left", pad_token="<s>")
        model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(torch_device)

        def greedy_decode(model_inputs):
            generated = model.generate(**model_inputs, do_sample=False, max_new_tokens=10)
            return tokenizer.batch_decode(generated, skip_special_tokens=True)

        # Baseline: padded only up to the longest prompt in the batch
        inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)
        # Both generations below run with the static cache selected on the generation config
        model.generation_config.cache_implementation = "static"
        self.assertListEqual(greedy_decode(inputs), EXPECTED_GENERATION)
        # Now with extra left-padding
        inputs_expanded = tokenizer(
            prompts,
            padding=True,
            return_tensors="pt",
            pad_to_multiple_of=32,
        ).to(model.device)
        # Sanity check: the second batch really is wider than the first
        self.assertTrue(inputs.input_ids.shape[1] < inputs_expanded.input_ids.shape[1])
        self.assertListEqual(greedy_decode(inputs_expanded), EXPECTED_GENERATION)
    @unittest.skip(reason="TODO @gante static cache's does not support beam search yet")
    def test_static_cache_beam_search(self):
        # Intentionally empty placeholder: the test is skipped by its `unittest.skip` decorator
        pass
    @require_torch_accelerator
    @slow
    def test_offloaded_cache_equivalent_to_dynamic_cache(self):
        """Tests that OffloadedCache produces the same result as the default DynamicCache"""
        model_name = "microsoft/Phi-3-mini-4k-instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
        device = model.device
        if not is_torch_greater_or_equal("2.7", accept_dev=True) and device.type == "xpu":
            self.skipTest(reason="This test requires torch >= 2.7 to run on xpu.")
        input_text = "Fun fact:"
        inputs = tokenizer(input_text, return_tensors="pt").to(device)
        # Generation settings shared by both runs; only `cache_implementation` differs between them
        common = {
            "num_beams": 4,
            "num_beam_groups": 2,
            "num_return_sequences": 4,
            "diversity_penalty": 1.0,
            "max_new_tokens": 20,
            "early_stopping": True,
        }
        original = GenerationConfig(**common)
        offloaded = GenerationConfig(cache_implementation="offloaded", **common)
        original_outputs = model.generate(generation_config=original, **inputs)
        offloaded_outputs = model.generate(generation_config=offloaded, **inputs)
        # Compare shapes first: a row-wise `zip` would silently ignore extra sequences in either output, and a
        # length mismatch would otherwise surface as a broadcasting error rather than a test failure.
        self.assertEqual(original_outputs.shape, offloaded_outputs.shape)
        # unittest assertion rather than a bare `assert`, which is stripped when Python runs with -O
        self.assertTrue(torch.equal(original_outputs, offloaded_outputs))
    @require_torch_accelerator
    @slow
    def test_offloaded_cache_uses_less_memory_than_dynamic_cache(self):
        """Tests that OffloadedCache uses less memory than the default DynamicCache"""
        model_name = "microsoft/Phi-3-mini-4k-instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
        device = model.device
        if not is_torch_greater_or_equal("2.7", accept_dev=True) and device.type == "xpu":
            self.skipTest(reason="This test requires torch >= 2.7 to run on xpu.")
        # Peak-memory statistics are read from `torch.cuda` / `torch.xpu`. On any other device type, skip explicitly
        # instead of failing later with an AttributeError on `None`.
        if device.type not in ("cuda", "xpu"):
            self.skipTest(reason=f"Peak memory statistics are only read for cuda and xpu devices, got {device.type}.")
        torch_accelerator_module = getattr(torch, device.type)
        input_text = "Fun fact:"
        inputs = tokenizer(input_text, return_tensors="pt").to(device)
        # Generation settings shared by both runs; only `cache_implementation` differs between them
        common = {
            "num_beams": 4,
            "num_beam_groups": 2,
            "num_return_sequences": 4,
            "diversity_penalty": 1.0,
            "max_new_tokens": 20,
            "early_stopping": True,
        }
        original = GenerationConfig(**common)
        offloaded = GenerationConfig(cache_implementation="offloaded", **common)
        # The peak counter is reset before each run, so each measurement covers a single `generate` call
        torch_accelerator_module.reset_peak_memory_stats(device)
        model.generate(generation_config=original, **inputs)
        original_peak_memory = torch_accelerator_module.max_memory_allocated(device)
        torch_accelerator_module.reset_peak_memory_stats(device)
        model.generate(generation_config=offloaded, **inputs)
        offloaded_peak_memory = torch_accelerator_module.max_memory_allocated(device)
        # unittest assertion rather than a bare `assert` (stripped under -O); the measured values are reported in the
        # failure message
        self.assertLess(
            offloaded_peak_memory,
            original_peak_memory,
            msg=f"original_peak_memory: {original_peak_memory}, offloaded_peak_memory: {offloaded_peak_memory}",
        )
    @require_torch_gpu
    @slow
    def test_cache_copy(self):
        """Checks generation from deep copies of a `StaticCache` that was pre-filled with a shared prompt prefix"""
        model_name = "microsoft/Phi-3-mini-4k-instruct"
        INITIAL_PROMPT = "You are a helpful assistant. "
        prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"]
        EXPECTED_DECODED_TEXT = [
            "You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTraveling is an enriching experience that broadens our horizons and exposes us to new cultures, landscapes, and people. Whether it's a week",
            'You are a helpful assistant. What is the capital of France?\n\n\n## Response:Paris is the capital of France.\n\n\n\n\n\n## Query:\n\nIn a detailed analysis, compare the economic impacts of the introduction of the'
        ]  # fmt: skip
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16)
        prompt_cache = StaticCache(
            config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16
        )
        prefix_inputs = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda")
        # This is the common prompt cached, we need to run forward without grad to be able to copy
        with torch.no_grad():
            prompt_cache = model(**prefix_inputs, past_key_values=prompt_cache).past_key_values

        def respond(prompt):
            full_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda")
            # Each prompt generates from its own copy, so generation is handed the copy rather than `prompt_cache`
            cache_copy = copy.deepcopy(prompt_cache)
            generated = model.generate(**full_inputs, past_key_values=cache_copy, max_new_tokens=40)
            return tokenizer.batch_decode(generated)[0]

        responses = [respond(prompt) for prompt in prompts]
        self.assertEqual(responses, EXPECTED_DECODED_TEXT)
    @require_torch_multi_gpu
    def test_data_parallel_dynamic_cache(self):
        """
        Checks that a forward pass through `nn.DataParallel` returns a `DynamicCache` whose tensors match those of
        the plain model. Under the hood, `DynamicCache` is rebuilt from multiple `DynamicCache` in the gather step.
        """
        model_repo = "hf-internal-testing/tiny-random-MistralForCausalLM"
        tokenizer = AutoTokenizer.from_pretrained(model_repo)
        model = AutoModelForCausalLM.from_pretrained(model_repo).to(torch_device)
        # w/o DP: batch_size = num_gpu
        # w DP: batch_size = 1 (with num_gpus replicas)
        batch = ["foo bar"] * get_gpu_count()
        model_inputs = tokenizer(batch, return_tensors="pt").to(model.device)
        # Reference cache, without data parallelism
        no_parallelism_cache = model(**model_inputs).past_key_values
        self.assertIsInstance(no_parallelism_cache, DynamicCache)
        # Same inputs through the DataParallel wrapper
        parallel_model = torch.nn.DataParallel(model)
        parallelism_cache = parallel_model(**model_inputs).past_key_values
        self.assertIsInstance(parallelism_cache, DynamicCache)
        # Check that the caches are the same, layer by layer
        n_layers = len(no_parallelism_cache)
        for layer_idx in range(n_layers):
            expected_layer = no_parallelism_cache[layer_idx]
            actual_layer = parallelism_cache[layer_idx]
            for kv_idx in (0, 1):  # 0 = key, 1 = value
                torch.testing.assert_close(actual=actual_layer[kv_idx], expected=expected_layer[kv_idx])
    @require_torch_gpu
    def test_static_cache_no_cuda_graph_skips(self):
        """
        Regression test for #36543: generating with the static cache (and compilation) must not skip cuda graphs,
        which is detected here by requiring that nothing is written to stderr.
        (? We set `fullgraph=True`, which according to torch docs means it should raise an exception. Instead,
        messages are being thrown to stderr?)
        """
        model_repo = "hf-internal-testing/tiny-random-MistralForCausalLM"
        tokenizer = AutoTokenizer.from_pretrained(model_repo)
        model = AutoModelForCausalLM.from_pretrained(model_repo).to(torch_device)
        inputs = tokenizer(["foo bar"], return_tensors="pt").to(torch_device)
        # on `main`, prior to #36543, this would send stderr messages about cuda graphs being skipped.
        with CaptureStderr() as captured:
            model.generate(**inputs, max_new_tokens=2, cache_implementation="static")
        self.assertEqual(captured.err, "")
    @require_torch_multi_gpu
    @slow
    def test_static_cache_multi_gpu(self):
        """Regression test for #35164: static cache with multi-gpu"""
        # NOTE(review): the title says "static" but `generate` below is called with cache_implementation="hybrid" --
        # presumably intentional for this checkpoint; confirm.
        model_id = "google/gemma-2-2b-it"
        num_hidden_layers = 26
        # Non-layer modules are placed explicitly; layers 0-12 go to device 0 and layers 13-25 to device 1
        device_map = {"model.embed_tokens": 0, "model.norm": 1, "model.rotary_emb": 1, "lm_head": 0}
        device_map.update({f"model.layers.{i}": (0 if i < 13 else 1) for i in range(num_hidden_layers)})
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype="bfloat16",
            device_map=device_map,
        )
        inputs = tokenizer("Today is a beautiful day!", return_tensors="pt").to(0)
        # Neither result is inspected: the test only requires both calls to complete without raising
        model(**inputs)
        model.generate(**inputs, max_new_tokens=2, cache_implementation="hybrid")
@require_torch
 class CacheExportIntegrationTest(unittest.TestCase):
    """Cache tests that rely on `torch.export()` and model loading"""
    def test_dynamic_cache_exportability(self):
        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM")
        model = model.eval()
@@ -282,485 +748,3 @@ class CacheTest(unittest.TestCase):
                n_static_value_caches = n_static_value_caches + 1
        self.assertEqual(n_static_key_caches, model.config.num_hidden_layers)
        self.assertEqual(n_static_value_caches, model.config.num_hidden_layers)
@require_torch_accelerator
@slow
 class CacheIntegrationTest(unittest.TestCase):
    def test_dynamic_cache_hard(self):
        """Checks that sampling with an explicit `DynamicCache` matches the run without one, and a reference text"""
        checkpoint = "meta-llama/Llama-2-7b-hf"
        reference_text = (
            "Here's everything I know about cats. Cats are mysterious creatures. They can't talk, and they don't like "
            "to be held. They don't play fetch, and they don't like to be hugged. But they do like to be petted.\n"
            "Cats are also very independent. They don't like to be told what to do, and they don't like to be told "
            "what to eat. They are also very territorial. They don't like to share their food or their toys.\nCats "
            "are also very curious. They like to explore, and they like to play. They are also very fast. They can "
            "run very fast, and they can jump very high.\nCats are also very smart. They can learn tricks, and they "
            "can solve problems. They are also very playful. They like to play with toys, and they like to play with "
            "other cats.\nCats are also very affectionate. They like to be petted, and they like to be held. They "
            "also like to be scratched.\nCats are also very clean. They like to groom themselves, and they like to "
            "clean their litter box.\nCats are also very independent. They don't"
        )
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16)
        inputs = tokenizer(["Here's everything I know about cats. Cats"], return_tensors="pt").to(model.device)
        sampling_kwargs = {"do_sample": True, "max_new_tokens": 256}
        # A run without an explicit cache and a run with a fresh DynamicCache must yield the same tokens. The seed is
        # reset before each sampled run so the two runs are comparable.
        set_seed(0)
        tokens_without_cache_arg = model.generate(**inputs, **sampling_kwargs)
        set_seed(0)
        tokens_with_dynamic_cache = model.generate(**inputs, **sampling_kwargs, past_key_values=DynamicCache())
        self.assertListEqual(tokens_without_cache_arg.tolist(), tokens_with_dynamic_cache.tolist())
        decoded_batch = tokenizer.batch_decode(tokens_with_dynamic_cache, skip_special_tokens=True)
        self.assertEqual(decoded_batch[0], reference_text)
    def test_dynamic_cache_batched(self):
        """Checks greedy generation on a left-padded batch of two prompts when a `DynamicCache` is passed in"""
        checkpoint = "meta-llama/Llama-2-7b-hf"
        prompts = ["A sequence: 1, 2, 3, 4, 5", "A sequence: A, B, C"]
        expected_continuations = ["A sequence: 1, 2, 3, 4, 5, 6, 7, 8,", "A sequence: A, B, C, D, E, F, G, H"]
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")
        # The EOS token doubles as the pad token for the padded batch below
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16)
        inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)
        generated = model.generate(**inputs, do_sample=False, max_new_tokens=10, past_key_values=DynamicCache())
        decoded_batch = tokenizer.batch_decode(generated, skip_special_tokens=True)
        self.assertListEqual(decoded_batch, expected_continuations)
    def test_dynamic_cache_beam_search(self):
        """
        Checks beam-search generation (2 beams, 2 returned sequences) against reference texts. No cache object is
        passed to `generate` here, so whichever cache `generate` picks by default is the one exercised.
        """
        checkpoint = "meta-llama/Llama-2-7b-hf"
        expected_beams = [
            "The best color is the one that makes you feel good.\nThe best color is the one that makes you feel good",
            "The best color is the one that suits you.\nThe best color is the one that suits you. The",
        ]
        beam_search_kwargs = {
            "do_sample": False,
            "max_new_tokens": 20,
            "num_beams": 2,
            "num_return_sequences": 2,
        }
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16)
        inputs = tokenizer(["The best color is"], return_tensors="pt").to(model.device)
        generated = model.generate(**inputs, **beam_search_kwargs)
        decoded_beams = tokenizer.batch_decode(generated, skip_special_tokens=True)
        self.assertListEqual(decoded_beams, expected_beams)
    def test_hybrid_cache_n_sequences(self):
        """
        Checks that beam search returning two sequences on `google/gemma-2-9b` matches the reference texts.

        NOTE(review): no cache argument is passed to `generate`; presumably the hybrid cache named in the test title
        is what this checkpoint uses by default -- confirm.
        """
        checkpoint = "google/gemma-2-9b"
        expected_beams = [
            "Hello I am doing a project for my school and I am trying to make a program that will allow me to input a",
            "Hello I am doing a project for my school and I am trying to make a program that will allow me to use a",
        ]
        beam_search_kwargs = {
            "do_sample": False,
            "max_new_tokens": 20,
            "num_return_sequences": 2,
            "num_beams": 2,
        }
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            attn_implementation="eager",
        )
        inputs = tokenizer(["Hello I am doing"], return_tensors="pt").to(model.device)
        generated = model.generate(**inputs, **beam_search_kwargs)
        decoded_beams = tokenizer.batch_decode(generated, skip_special_tokens=True)
        self.assertListEqual(decoded_beams, expected_beams)
    @require_non_xpu
    @require_gptq
    def test_sink_cache_hard(self):
        """Checks that a 3000-token greedy generation through a `SinkCache` ends with the expected sentence"""
        checkpoint = "TheBloke/LLaMa-7B-GPTQ"
        expected_ending = "to perform a variety of tasks. The Transformer is a neural network"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")
        prompts = ["Vaswani et al. (2017) introduced the Transformers"]
        inputs = tokenizer(prompts, return_tensors="pt").to(model.device)
        # Set up the SinkCache. Using a small window length to contain computational complexity. If this example is run
        # without a SinkCache, the last few tokens are gibberish (ends in "of the of the of a of a of")
        sink_cache = SinkCache(window_length=508, num_sink_tokens=4)
        generated = model.generate(**inputs, do_sample=False, max_new_tokens=3000, past_key_values=sink_cache)
        final_text = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        self.assertTrue(final_text.endswith(expected_ending))
    def test_sink_cache_iterative_prompts(self):
        """Tests that SinkCache supports more than one new token at once, when shifting the cache"""
        checkpoint = "HuggingFaceH4/zephyr-7b-beta"
        prompt = (
            "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences "
            "and must-see attractions."
        )
        expected_ending = (
            "<|assistant|>\nAs the sun began to set over the Pacific Ocean, I found myself standing on the shores of "
            "Waikiki Beach, my heart filled with awe and wonder. I had just returned from a two-week journey to the "
            "beautiful island of Hawaii, and it had been an unforgettable experience filled with cultural experiences "
            "and must-see attractions that left me breathless.\n\nOne of the most memorable experiences of my trip "
            "was visiting the historic district of Honolulu. Here,"
        )
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16)
        # One cache object is shared by all three generation rounds
        sink_cache = SinkCache(window_length=256, num_sink_tokens=4)
        # Start from an empty token history; each round appends the templated prompt, and the output of `generate`
        # becomes the history for the next round
        input_ids = torch.tensor([], device=model.device, dtype=torch.int)
        for _round in range(3):
            # Tokenize the prompt with the correct chat template
            chat = [{"role": "user", "content": prompt}]
            templated_prompt = tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True)
            input_ids = torch.cat((input_ids, templated_prompt.to(model.device)), dim=1)
            input_ids = model.generate(
                input_ids, do_sample=False, max_new_tokens=100, past_key_values=sink_cache, use_cache=True
            )
        # We went well beyond the cache length
        self.assertTrue(input_ids.shape[1] > sink_cache.get_max_cache_shape() * 1.5)
        # And the text still ends with the expected, coherent English passage
        full_text = tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0]
        self.assertTrue(full_text.endswith(expected_ending))
    @parameterized.expand(
        [
            ("eager", "static"),
            ("sdpa", "static"),
        ]
    )
    @require_torch_gpu
    def test_static_cache_greedy_decoding_pad_left(self, attn_implementation, cache_implementation):
        """
        Tests that greedy decoding on left-padded inputs produces the same text with the default cache, with
        `cache_implementation` set on the generation config, and with `model.forward` additionally wrapped in
        `torch.compile`.
        """
        # NOTE: `parameterized.expand` has to be the outermost decorator. It registers the generated test methods
        # while decorating, so a skip decorator stacked above it would never be applied to them.
        EXPECTED_GENERATION = [
            "The best color is the one that complements the skin tone of the",
            "We should not undermind the issues at hand.\nWe should not undermind the issues",
        ]
        tokenizer = AutoTokenizer.from_pretrained(
            "NousResearch/Llama-2-7b-chat-hf", padding_side="left", pad_token="<s>"
        )
        model = AutoModelForCausalLM.from_pretrained(
            "NousResearch/Llama-2-7b-chat-hf",
            torch_dtype=torch.bfloat16,
            attn_implementation=attn_implementation,
        ).to(torch_device)
        inputs = tokenizer(
            ["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt"
        ).to(model.device)

        def greedy_decode():
            # Re-seed before every run, so that the runs only differ in the cache/compilation settings
            set_seed(0)
            gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10)
            return tokenizer.batch_decode(gen_out, skip_special_tokens=True)

        # 1. Default cache
        decoded = greedy_decode()
        with self.subTest(f"{attn_implementation}, dynamic"):
            self.assertListEqual(decoded, EXPECTED_GENERATION)
        # 2. Requested cache implementation, before `model.forward` is wrapped in `torch.compile`
        model.generation_config.cache_implementation = cache_implementation
        decoded = greedy_decode()
        with self.subTest(f"{attn_implementation}, static, eager"):
            self.assertListEqual(decoded, EXPECTED_GENERATION)
        # 3. Requested cache implementation, with `model.forward` wrapped in `torch.compile`
        model.forward = torch.compile(model.forward)
        decoded = greedy_decode()
        with self.subTest(f"{attn_implementation}, static, compiled"):
            self.assertListEqual(decoded, EXPECTED_GENERATION)
    @parameterized.expand(
        [
            ("eager", "static"),
            ("sdpa", "static"),
        ]
    )
    @require_torch_gpu
    def test_static_cache_greedy_decoding_pad_right(self, attn_implementation, cache_implementation):
        """
        Tests that greedy decoding on right-padded inputs produces the same text with the default cache and with
        `cache_implementation` set on the generation config.
        """
        # NOTE: `parameterized.expand` has to be the outermost decorator. It registers the generated test methods
        # while decorating, so a skip decorator stacked above it would never be applied to them.
        EXPECTED_GENERATION = [
            "The best color isЋ the one that complements the skin tone of",
            "We should not undermind the issues at hand.\nWe should not undermind the issues",
        ]
        tokenizer = AutoTokenizer.from_pretrained(
            "NousResearch/Llama-2-7b-chat-hf", padding_side="right", pad_token="<s>"
        )
        model = AutoModelForCausalLM.from_pretrained(
            "NousResearch/Llama-2-7b-chat-hf",
            torch_dtype=torch.bfloat16,
            attn_implementation=attn_implementation,
        ).to(torch_device)
        inputs = tokenizer(
            ["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt"
        ).to(model.device)

        def greedy_decode():
            # Re-seed before every run, so that the runs only differ in the cache settings
            set_seed(0)
            gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10)
            return tokenizer.batch_decode(gen_out, skip_special_tokens=True)

        # 1. Default cache
        decoded = greedy_decode()
        with self.subTest(f"{attn_implementation}, dynamic"):
            self.assertListEqual(decoded, EXPECTED_GENERATION)
        # 2. Requested cache implementation
        model.generation_config.cache_implementation = cache_implementation
        decoded = greedy_decode()
        with self.subTest(f"{attn_implementation}, static, eager"):
            self.assertListEqual(decoded, EXPECTED_GENERATION)
    def test_dynamic_cache_extra_left_padding(self):
        """Checks that generation with the default cache is unchanged when the inputs get additional left-padding"""
        checkpoint = "NousResearch/Llama-2-7b-chat-hf"
        prompts = ["The best color is", "We should not undermind the issues at hand"]
        EXPECTED_GENERATION = [
            "The best color is the one that complements the skin tone of the",
            "We should not undermind the issues at hand.\nWe should not undermind the issues",
        ]
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left", pad_token="<s>")
        model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)
        model = model.to(torch_device)
        # Baseline: pad only up to the longest prompt in the batch
        baseline_inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)
        baseline_ids = model.generate(**baseline_inputs, do_sample=False, max_new_tokens=10)
        self.assertListEqual(tokenizer.batch_decode(baseline_ids, skip_special_tokens=True), EXPECTED_GENERATION)
        # Same prompts, now padded up to a multiple of 32 tokens
        padded_inputs = tokenizer(prompts, padding=True, return_tensors="pt", pad_to_multiple_of=32)
        padded_inputs = padded_inputs.to(model.device)
        # Sanity check: the second batch must really be wider than the baseline
        self.assertTrue(baseline_inputs.input_ids.shape[1] < padded_inputs.input_ids.shape[1])
        padded_ids = model.generate(**padded_inputs, do_sample=False, max_new_tokens=10)
        self.assertListEqual(tokenizer.batch_decode(padded_ids, skip_special_tokens=True), EXPECTED_GENERATION)
    @parameterized.expand(["static"])
    def test_static_cache_extra_left_padding(self, cache_implementation):
        """
        Checks that generation with `cache_implementation` set on the generation config is unchanged when the inputs
        get additional left-padding
        """
        checkpoint = "NousResearch/Llama-2-7b-chat-hf"
        prompts = ["The best color is", "We should not undermind the issues at hand"]
        EXPECTED_GENERATION = [
            "The best color is the one that complements the skin tone of the",
            "We should not undermind the issues at hand.\nWe should not undermind the issues",
        ]
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left", pad_token="<s>")
        model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)
        model = model.to(torch_device)
        # Every `generate` call below picks the cache implementation up from the generation config
        model.generation_config.cache_implementation = cache_implementation
        # Baseline: pad only up to the longest prompt in the batch
        baseline_inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)
        baseline_ids = model.generate(**baseline_inputs, do_sample=False, max_new_tokens=10)
        self.assertListEqual(tokenizer.batch_decode(baseline_ids, skip_special_tokens=True), EXPECTED_GENERATION)
        # Same prompts, now padded up to a multiple of 32 tokens
        padded_inputs = tokenizer(prompts, padding=True, return_tensors="pt", pad_to_multiple_of=32)
        padded_inputs = padded_inputs.to(model.device)
        # Sanity check: the second batch must really be wider than the baseline
        self.assertTrue(baseline_inputs.input_ids.shape[1] < padded_inputs.input_ids.shape[1])
        padded_ids = model.generate(**padded_inputs, do_sample=False, max_new_tokens=10)
        self.assertListEqual(tokenizer.batch_decode(padded_ids, skip_special_tokens=True), EXPECTED_GENERATION)
    @unittest.skip(reason="TODO @gante static cache's does not support beam search yet")
    def test_static_cache_beam_search(self):
        """Placeholder for a static cache + beam search test; always skipped for the reason given above."""
    @require_torch_accelerator
    def test_offloaded_cache_equivalent_to_dynamic_cache(self):
        """
        Tests that generating with `cache_implementation="offloaded"` returns exactly the same token ids as
        generating with the default cache, under the same (diverse beam search) generation settings.
        """
        model_name = "microsoft/Phi-3-mini-4k-instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
        device = model.device
        if not is_torch_greater_or_equal("2.7", accept_dev=True) and device.type == "xpu":
            self.skipTest(reason="This test requires torch >= 2.7 to run on xpu.")
        input_text = "Fun fact:"
        inputs = tokenizer(input_text, return_tensors="pt").to(device)
        # Settings shared by both runs; only the cache implementation differs between the two configs
        common = {
            "num_beams": 4,
            "num_beam_groups": 2,
            "num_return_sequences": 4,
            "diversity_penalty": 1.0,
            "max_new_tokens": 20,
            "early_stopping": True,
        }
        original = GenerationConfig(**common)
        offloaded = GenerationConfig(cache_implementation="offloaded", **common)
        original_outputs = model.generate(generation_config=original, **inputs)
        offloaded_outputs = model.generate(generation_config=offloaded, **inputs)
        # Compare the full output tensors at once: unlike a bare `assert` inside a `zip` loop, this also fails when
        # the two runs return a different number of sequences or sequences of different lengths, and it is not
        # stripped when Python runs with `-O`. Zero tolerances make the comparison exact.
        torch.testing.assert_close(actual=offloaded_outputs, expected=original_outputs, rtol=0, atol=0)
    @require_torch_accelerator
    def test_offloaded_cache_uses_less_memory_than_dynamic_cache(self):
        """
        Tests that generating with `cache_implementation="offloaded"` has a lower peak of allocated accelerator
        memory than generating with the default cache, under the same generation settings.
        """
        model_name = "microsoft/Phi-3-mini-4k-instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
        device = model.device
        if not is_torch_greater_or_equal("2.7", accept_dev=True) and device.type == "xpu":
            self.skipTest(reason="This test requires torch >= 2.7 to run on xpu.")
        # Pick the torch backend module that exposes the peak-memory statistics for this device. Other accelerators
        # are skipped explicitly, instead of crashing with an `AttributeError` on `None`.
        if device.type == "cuda":
            torch_accelerator_module = torch.cuda
        elif device.type == "xpu":
            torch_accelerator_module = torch.xpu
        else:
            self.skipTest(reason=f"Peak memory statistics are only read for cuda and xpu, got {device.type}.")
        input_text = "Fun fact:"
        inputs = tokenizer(input_text, return_tensors="pt").to(device)
        # Settings shared by both runs; only the cache implementation differs between the two configs
        common = {
            "num_beams": 4,
            "num_beam_groups": 2,
            "num_return_sequences": 4,
            "diversity_penalty": 1.0,
            "max_new_tokens": 20,
            "early_stopping": True,
        }
        original = GenerationConfig(**common)
        offloaded = GenerationConfig(cache_implementation="offloaded", **common)
        # Reset the peak counter before each run, so that each measurement only covers its own `generate` call
        torch_accelerator_module.reset_peak_memory_stats(device)
        model.generate(generation_config=original, **inputs)
        original_peak_memory = torch_accelerator_module.max_memory_allocated(device)
        torch_accelerator_module.reset_peak_memory_stats(device)
        model.generate(generation_config=offloaded, **inputs)
        offloaded_peak_memory = torch_accelerator_module.max_memory_allocated(device)
        self.assertLess(
            offloaded_peak_memory,
            original_peak_memory,
            msg=f"original_peak_memory: {original_peak_memory}, offloaded_peak_memory: {offloaded_peak_memory}",
        )
    @require_torch_gpu
    def test_cache_copy(self):
        """
        Checks that a `StaticCache` pre-filled with a shared prompt prefix can be deep-copied and that each copy can
        be passed to `generate` for a different continuation of that prefix.
        """
        checkpoint = "microsoft/Phi-3-mini-4k-instruct"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="cuda", torch_dtype=torch.bfloat16)
        INITIAL_PROMPT = "You are a helpful assistant. "
        prefix_inputs = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda")
        empty_cache = StaticCache(
            config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16
        )
        # Pre-fill the cache with the shared prefix. The forward pass has to run without grad for the resulting
        # cache to be copyable.
        with torch.no_grad():
            prefix_cache = model(**prefix_inputs, past_key_values=empty_cache).past_key_values

        def complete(user_prompt):
            # Every prompt gets its own copy of the pre-filled cache
            full_inputs = tokenizer(INITIAL_PROMPT + user_prompt, return_tensors="pt").to("cuda")
            cache_copy = copy.deepcopy(prefix_cache)
            generated = model.generate(**full_inputs, past_key_values=cache_copy, max_new_tokens=40)
            return tokenizer.batch_decode(generated)[0]

        user_prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"]
        responses = [complete(user_prompt) for user_prompt in user_prompts]
        EXPECTED_DECODED_TEXT = [
            "You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTraveling is an enriching experience that broadens our horizons and exposes us to new cultures, landscapes, and people. Whether it's a week",
            'You are a helpful assistant. What is the capital of France?\n\n\n## Response:Paris is the capital of France.\n\n\n\n\n\n## Query:\n\nIn a detailed analysis, compare the economic impacts of the introduction of the'
        ]  # fmt: skip
        self.assertEqual(responses, EXPECTED_DECODED_TEXT)
    @require_torch_multi_gpu
    def test_data_parallel_dynamic_cache(self):
        """
        Checks that a forward pass through `nn.DataParallel` returns a `DynamicCache` whose contents match the cache
        returned by the unwrapped model on the same batch. Under the hood, `DynamicCache` is rebuilt from multiple
        `DynamicCache` in the gather step.
        """
        checkpoint = "hf-internal-testing/tiny-random-MistralForCausalLM"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForCausalLM.from_pretrained(checkpoint).to(torch_device)
        # One row per GPU: the plain model sees a batch of `num_gpus` rows, while with DataParallel each of the
        # `num_gpus` replicas sees a batch of 1
        batch = tokenizer(["foo bar"] * get_gpu_count(), return_tensors="pt").to(model.device)
        # Reference: no data parallelism
        reference_cache = model(**batch).past_key_values
        self.assertIsInstance(reference_cache, DynamicCache)
        # Same batch through the DataParallel wrapper
        parallel_model = torch.nn.DataParallel(model)
        gathered_cache = parallel_model(**batch).past_key_values
        self.assertIsInstance(gathered_cache, DynamicCache)
        # Both caches must hold the same tensors, layer by layer
        for layer_idx in range(len(reference_cache)):
            for kv_idx in (0, 1):  # 0 = key, 1 = value
                expected_tensor = reference_cache[layer_idx][kv_idx]
                actual_tensor = gathered_cache[layer_idx][kv_idx]
                torch.testing.assert_close(actual=actual_tensor, expected=expected_tensor)
    @require_torch_gpu
    def test_static_cache_no_cuda_graph_skips(self):
        """
        Regression test for #36543: generating with `cache_implementation="static"` must not write anything to
        stderr. Before that fix, messages about cuda graphs being skipped were emitted there.
        (Open question kept from the original author: with `fullgraph=True`, torch docs suggest an exception
        should be raised, yet the messages show up on stderr instead?)
        """
        checkpoint = "hf-internal-testing/tiny-random-MistralForCausalLM"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForCausalLM.from_pretrained(checkpoint).to(torch_device)
        model_inputs = tokenizer(["foo bar"], return_tensors="pt").to(torch_device)
        # Record everything written to stderr while generating
        with CaptureStderr() as captured:
            model.generate(**model_inputs, max_new_tokens=2, cache_implementation="static")
        # Any stderr output at all counts as a failure
        self.assertEqual(captured.err, "")
    @require_torch_multi_gpu
    def test_static_cache_multi_gpu(self):
        """
        Regression test for #35164: static cache with multi-gpu. The model is split across devices 0 and 1 with a
        hand-written device map, then a forward pass and a `generate` call with `cache_implementation="hybrid"` are
        run; the test passes if neither call raises.
        """
        checkpoint = "google/gemma-2-2b-it"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        # Non-layer modules are pinned by hand; the 26 decoder layers are split evenly, the first 13 on device 0
        # and the remaining 13 on device 1
        total_layers, layers_on_first_device = 26, 13
        device_map = {"model.embed_tokens": 0, "model.norm": 1, "model.rotary_emb": 1, "lm_head": 0}
        device_map.update(
            {
                f"model.layers.{layer_idx}": 0 if layer_idx < layers_on_first_device else 1
                for layer_idx in range(total_layers)
            }
        )
        model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="bfloat16", device_map=device_map)
        model_inputs = tokenizer("Today is a beautiful day!", return_tensors="pt").to(0)
        # Outputs are not inspected
        model(**model_inputs)
        model.generate(**model_inputs, max_new_tokens=2, cache_implementation="hybrid")