From 755b0fa2fe85d13726585609efeac593d394783e Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Apr 2025 12:21:14 +0100 Subject: [PATCH] [tests] reorganize cache tests and clean memory between tests (#37684) --- tests/utils/test_cache_utils.py | 948 ++++++++++++++++---------------- 1 file changed, 466 insertions(+), 482 deletions(-) diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py index 96c757fd8f..e5b43bec92 100644 --- a/tests/utils/test_cache_utils.py +++ b/tests/utils/test_cache_utils.py @@ -20,6 +20,7 @@ from parameterized import parameterized from transformers import set_seed from transformers.testing_utils import ( CaptureStderr, + cleanup, get_gpu_count, is_torch_available, require_gptq, @@ -53,6 +54,8 @@ if is_torch_available(): @require_torch class CacheTest(unittest.TestCase): + """Cache tests that don't require loading models""" + def test_dynamic_cache_retrocompatibility(self): """Tests that we can convert back and forth between the legacy cache format and DynamicCache""" legacy_cache = () @@ -173,6 +176,469 @@ class CacheTest(unittest.TestCase): self.assertTrue(cached_keys.shape == (1, 1, 10, 128)) self.assertTrue(cached_values.shape == (1, 1, 10, 128)) + +@require_torch_accelerator +class CacheIntegrationTest(unittest.TestCase): + """Cache tests that require loading models""" + + def tearDown(self): + # Some tests use large models, which might result in suboptimal torch re-allocation if we run multiple tests + # in a row + cleanup(torch_device, gc_collect=True) + + @slow + def test_dynamic_cache_hard(self): + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left") + model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16 + ) + inputs = tokenizer(["Here's everything I know about cats. Cats"], return_tensors="pt").to(model.device) + + # DynamicCache and the legacy cache format should be equivalent + set_seed(0) + gen_out_legacy = model.generate(**inputs, do_sample=True, max_new_tokens=256) + set_seed(0) + gen_out = model.generate(**inputs, do_sample=True, max_new_tokens=256, past_key_values=DynamicCache()) + self.assertListEqual(gen_out_legacy.tolist(), gen_out.tolist()) + + decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) + expected_text = ( + "Here's everything I know about cats. Cats are mysterious creatures. They can't talk, and they don't like " + "to be held. They don't play fetch, and they don't like to be hugged. But they do like to be petted.\n" + "Cats are also very independent. They don't like to be told what to do, and they don't like to be told " + "what to eat. They are also very territorial. They don't like to share their food or their toys.\nCats " + "are also very curious. They like to explore, and they like to play. They are also very fast. They can " + "run very fast, and they can jump very high.\nCats are also very smart. They can learn tricks, and they " + "can solve problems. They are also very playful. They like to play with toys, and they like to play with " + "other cats.\nCats are also very affectionate. They like to be petted, and they like to be held. They " + "also like to be scratched.\nCats are also very clean. They like to groom themselves, and they like to " + "clean their litter box.\nCats are also very independent. They don't" + ) + self.assertEqual(decoded[0], expected_text) + + @slow + def test_dynamic_cache_batched(self): + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left") + tokenizer.pad_token = tokenizer.eos_token + model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16 + ) + inputs = tokenizer(["A sequence: 1, 2, 3, 4, 5", "A sequence: A, B, C"], padding=True, return_tensors="pt").to( + model.device + ) + + gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10, past_key_values=DynamicCache()) + decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) + expected_text = ["A sequence: 1, 2, 3, 4, 5, 6, 7, 8,", "A sequence: A, B, C, D, E, F, G, H"] + self.assertListEqual(decoded, expected_text) + + @slow + def test_dynamic_cache_beam_search(self): + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left") + model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16 + ) + + inputs = tokenizer(["The best color is"], return_tensors="pt").to(model.device) + gen_out = model.generate( + **inputs, + do_sample=False, + max_new_tokens=20, + num_beams=2, + num_return_sequences=2, + ) + decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) + expected_text = [ + "The best color is the one that makes you feel good.\nThe best color is the one that makes you feel good", + "The best color is the one that suits you.\nThe best color is the one that suits you. The", + ] + self.assertListEqual(decoded, expected_text) + + @slow + def test_hybrid_cache_n_sequences(self): + tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b") + model = AutoModelForCausalLM.from_pretrained( + "google/gemma-2-9b", + device_map="auto", + torch_dtype=torch.bfloat16, + attn_implementation="eager", + ) + + inputs = tokenizer(["Hello I am doing"], return_tensors="pt").to(model.device) + + gen_out = model.generate( + **inputs, + do_sample=False, + max_new_tokens=20, + num_return_sequences=2, + num_beams=2, + ) + decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) + expected_text = [ + "Hello I am doing a project for my school and I am trying to make a program that will allow me to input a", + "Hello I am doing a project for my school and I am trying to make a program that will allow me to use a", + ] + self.assertListEqual(decoded, expected_text) + + @require_non_xpu + @require_gptq + @slow + def test_sink_cache_hard(self): + tokenizer = AutoTokenizer.from_pretrained("TheBloke/LLaMa-7B-GPTQ") + model = AutoModelForCausalLM.from_pretrained("TheBloke/LLaMa-7B-GPTQ", device_map="auto") + + inputs = tokenizer(["Vaswani et al. (2017) introduced the Transformers"], return_tensors="pt").to(model.device) + + # Set up the SinkCache. Using a small window length to contain computational complexity. If this example is run + # without a SinkCache, the last few tokens are gibberish (ends in "of the of the of a of a of") + cache = SinkCache(window_length=508, num_sink_tokens=4) + gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=3000, past_key_values=cache) + decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) + self.assertTrue(decoded[0].endswith("to perform a variety of tasks. The Transformer is a neural network")) + + @slow + def test_sink_cache_iterative_prompts(self): + """Tests that SinkCache supports more than one new token at once, when shifting the cache""" + tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") + model = AutoModelForCausalLM.from_pretrained( + "HuggingFaceH4/zephyr-7b-beta", device_map="auto", torch_dtype=torch.float16 + ) + prompt = ( + "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences " + "and must-see attractions." + ) + + # Prepare generation settings + cache = SinkCache(window_length=256, num_sink_tokens=4) + input_ids = torch.tensor([], device=model.device, dtype=torch.int) + for _ in range(3): + # Tokenize the prompt with the correct chat template + chat = [{"role": "user", "content": prompt}] + tokenized_chat = tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True).to( + model.device + ) + input_ids = torch.cat((input_ids, tokenized_chat), dim=1) + + # Perform the generation + gen_out = model.generate( + input_ids, do_sample=False, max_new_tokens=100, past_key_values=cache, use_cache=True + ) + input_ids = gen_out + + # We went well beyond the cache length + self.assertTrue(input_ids.shape[1] > cache.get_max_cache_shape() * 1.5) + + # And it still produces a coherent english + decoded = tokenizer.batch_decode(input_ids, skip_special_tokens=True) + last_output = ( + "<|assistant|>\nAs the sun began to set over the Pacific Ocean, I found myself standing on the shores of " + "Waikiki Beach, my heart filled with awe and wonder. I had just returned from a two-week journey to the " + "beautiful island of Hawaii, and it had been an unforgettable experience filled with cultural experiences " + "and must-see attractions that left me breathless.\n\nOne of the most memorable experiences of my trip " + "was visiting the historic district of Honolulu. Here," + ) + self.assertTrue(decoded[0].endswith(last_output)) + + @parameterized.expand( + [ + ("eager", "static"), + ("sdpa", "static"), + ] + ) + @require_torch_gpu + @slow + def test_static_cache_greedy_decoding_pad_left(self, attn_implementation, cache_implementation): + EXPECTED_GENERATION = [ + "The best color is the one that complements the skin tone of the", + "We should not undermind the issues at hand.\nWe should not undermind the issues", + ] + + tokenizer = AutoTokenizer.from_pretrained( + "NousResearch/Llama-2-7b-chat-hf", padding_side="left", pad_token="" + ) + model = AutoModelForCausalLM.from_pretrained( + "NousResearch/Llama-2-7b-chat-hf", + torch_dtype=torch.bfloat16, + attn_implementation=attn_implementation, + ).to(torch_device) + inputs = tokenizer( + ["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt" + ).to(model.device) + + set_seed(0) + gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10) + decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) + with self.subTest(f"{attn_implementation}, dynamic"): + self.assertListEqual(decoded, EXPECTED_GENERATION) + + set_seed(0) + model.generation_config.cache_implementation = cache_implementation + gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10) + decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) + with self.subTest(f"{attn_implementation}, static, eager"): + self.assertListEqual(decoded, EXPECTED_GENERATION) + + set_seed(0) + model.forward = torch.compile(model.forward) + gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10) + decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) + with self.subTest(f"{attn_implementation}, static, compiled"): + self.assertListEqual(decoded, EXPECTED_GENERATION) + + @slow + def test_dynamic_cache_extra_left_padding(self): + """Tests that adding extra left-padding does not affect the generation with the dynamic cache""" + EXPECTED_GENERATION = [ + "The best color is the one that complements the skin tone of the", + "We should not undermind the issues at hand.\nWe should not undermind the issues", + ] + + tokenizer = AutoTokenizer.from_pretrained( + "NousResearch/Llama-2-7b-chat-hf", padding_side="left", pad_token="" + ) + model = AutoModelForCausalLM.from_pretrained( + "NousResearch/Llama-2-7b-chat-hf", + torch_dtype=torch.bfloat16, + ).to(torch_device) + inputs = tokenizer( + ["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt" + ).to(model.device) + + gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10) + decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) + self.assertListEqual(decoded, EXPECTED_GENERATION) + + # Now with extra left-padding + inputs_expanded = tokenizer( + ["The best color is", "We should not undermind the issues at hand"], + padding=True, + return_tensors="pt", + pad_to_multiple_of=32, + ).to(model.device) + self.assertTrue(inputs.input_ids.shape[1] < inputs_expanded.input_ids.shape[1]) + gen_out = model.generate(**inputs_expanded, do_sample=False, max_new_tokens=10) + decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) + self.assertListEqual(decoded, EXPECTED_GENERATION) + + @slow + def test_static_cache_extra_left_padding(self): + """Tests that adding extra left-padding does not affect the generation with the static cache""" + EXPECTED_GENERATION = [ + "The best color is the one that complements the skin tone of the", + "We should not undermind the issues at hand.\nWe should not undermind the issues", + ] + + tokenizer = AutoTokenizer.from_pretrained( + "NousResearch/Llama-2-7b-chat-hf", padding_side="left", pad_token="" + ) + model = AutoModelForCausalLM.from_pretrained( + "NousResearch/Llama-2-7b-chat-hf", + torch_dtype=torch.bfloat16, + ).to(torch_device) + inputs = tokenizer( + ["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt" + ).to(model.device) + + model.generation_config.cache_implementation = "static" + + gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10) + decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) + self.assertListEqual(decoded, EXPECTED_GENERATION) + + # Now with extra left-padding + inputs_expanded = tokenizer( + ["The best color is", "We should not undermind the issues at hand"], + padding=True, + return_tensors="pt", + pad_to_multiple_of=32, + ).to(model.device) + self.assertTrue(inputs.input_ids.shape[1] < inputs_expanded.input_ids.shape[1]) + gen_out = model.generate(**inputs_expanded, do_sample=False, max_new_tokens=10) + decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) + self.assertListEqual(decoded, EXPECTED_GENERATION) + + @unittest.skip(reason="TODO @gante static cache's does not support beam search yet") + def test_static_cache_beam_search(self): + pass + + @require_torch_accelerator + @slow + def test_offloaded_cache_equivalent_to_dynamic_cache(self): + """Tests that OffloadedCache produces the same result as the default DynamicCache""" + model_name = "microsoft/Phi-3-mini-4k-instruct" + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16) + device = model.device + + if not is_torch_greater_or_equal("2.7", accept_dev=True) and device.type == "xpu": + self.skipTest(reason="This test requires torch >= 2.7 to run on xpu.") + + input_text = "Fun fact:" + inputs = tokenizer(input_text, return_tensors="pt").to(device) + common = { + "num_beams": 4, + "num_beam_groups": 2, + "num_return_sequences": 4, + "diversity_penalty": 1.0, + "max_new_tokens": 20, + "early_stopping": True, + } + original = GenerationConfig(**common) + offloaded = GenerationConfig(cache_implementation="offloaded", **common) + original_outputs = model.generate(generation_config=original, **inputs) + offloaded_outputs = model.generate(generation_config=offloaded, **inputs) + for original_output, offloaded_output in zip(original_outputs, offloaded_outputs): + assert torch.all(original_output == offloaded_output).item() + + @require_torch_accelerator + @slow + def test_offloaded_cache_uses_less_memory_than_dynamic_cache(self): + """Tests that OffloadedCache uses less memory than the default DynamicCache""" + model_name = "microsoft/Phi-3-mini-4k-instruct" + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16) + device = model.device + + if not is_torch_greater_or_equal("2.7", accept_dev=True) and device.type == "xpu": + self.skipTest(reason="This test requires torch >= 2.7 to run on xpu.") + + input_text = "Fun fact:" + inputs = tokenizer(input_text, return_tensors="pt").to(device) + common = { + "num_beams": 4, + "num_beam_groups": 2, + "num_return_sequences": 4, + "diversity_penalty": 1.0, + "max_new_tokens": 20, + "early_stopping": True, + } + original = GenerationConfig(**common) + offloaded = GenerationConfig(cache_implementation="offloaded", **common) + + torch_accelerator_module = None + if device.type == "cuda": + torch_accelerator_module = torch.cuda + elif device.type == "xpu": + torch_accelerator_module = torch.xpu + + torch_accelerator_module.reset_peak_memory_stats(device) + model.generate(generation_config=original, **inputs) + original_peak_memory = torch_accelerator_module.max_memory_allocated(device) + torch_accelerator_module.reset_peak_memory_stats(device) + model.generate(generation_config=offloaded, **inputs) + offloaded_peak_memory = torch_accelerator_module.max_memory_allocated(device) + print(f"original_peak_memory: {original_peak_memory}, offloaded_peak_memory: {offloaded_peak_memory}") + assert offloaded_peak_memory < original_peak_memory + + @require_torch_gpu + @slow + def test_cache_copy(self): + model_name = "microsoft/Phi-3-mini-4k-instruct" + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16) + + prompt_cache = StaticCache( + config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16 + ) + + INITIAL_PROMPT = "You are a helpful assistant. " + inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda") + # This is the common prompt cached, we need to run forward without grad to be abel to copy + with torch.no_grad(): + prompt_cache = model(**inputs_initial_prompt, past_key_values=prompt_cache).past_key_values + + prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"] + responses = [] + for prompt in prompts: + new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda") + past_key_values = copy.deepcopy(prompt_cache) + outputs = model.generate(**new_inputs, past_key_values=past_key_values, max_new_tokens=40) + response = tokenizer.batch_decode(outputs)[0] + responses.append(response) + + EXPECTED_DECODED_TEXT = [ + "You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTraveling is an enriching experience that broadens our horizons and exposes us to new cultures, landscapes, and people. Whether it's a week", + 'You are a helpful assistant. What is the capital of France?\n\n\n## Response:Paris is the capital of France.\n\n\n\n\n\n## Query:\n\nIn a detailed analysis, compare the economic impacts of the introduction of the' + ] # fmt: skip + self.assertEqual(responses, EXPECTED_DECODED_TEXT) + + @require_torch_multi_gpu + def test_data_parallel_dynamic_cache(self): + """ + Tests that the dynamic cache works with nn.DataParallel. Under the hood, `DynamicCache` is rebuilt from + multiple `DynamicCache` in the gather step. + """ + + model_repo = "hf-internal-testing/tiny-random-MistralForCausalLM" + model = AutoModelForCausalLM.from_pretrained(model_repo).to(torch_device) + tokenizer = AutoTokenizer.from_pretrained(model_repo) + + # w/o DP: batch_size = num_gpu + # w DP: batch_size = 1 (with num_gpus replicas) + num_gpus = get_gpu_count() + model_inputs = tokenizer(["foo bar"] * num_gpus, return_tensors="pt").to(model.device) + + # w/o DP + no_parallelism_cache = model(**model_inputs).past_key_values + self.assertIsInstance(no_parallelism_cache, DynamicCache) + + # w DP + model = torch.nn.DataParallel(model) + parallelism_cache = model(**model_inputs).past_key_values + self.assertIsInstance(parallelism_cache, DynamicCache) + + # Check that the caches are the same + for layer_idx in range(len(no_parallelism_cache)): + for kv_idx in range(2): # 0 = key, 1 = value + torch.testing.assert_close( + actual=parallelism_cache[layer_idx][kv_idx], expected=no_parallelism_cache[layer_idx][kv_idx] + ) + + @require_torch_gpu + def test_static_cache_no_cuda_graph_skips(self): + """ + Tests generating with static cache and compilation doesn't skip cuda graphs. Regression test for #36543. + + (? We set `fullgraph=True`, which according to torch docs means it should raise an exception. Instead, + messages are being thrown to stderr?) + """ + model_repo = "hf-internal-testing/tiny-random-MistralForCausalLM" + model = AutoModelForCausalLM.from_pretrained(model_repo).to(torch_device) + tokenizer = AutoTokenizer.from_pretrained(model_repo) + inputs = tokenizer(["foo bar"], return_tensors="pt").to(torch_device) + + # on `main`, prior to #36543, this would send stderr messages about cuda graphs being skipped. + with CaptureStderr() as cap: + model.generate(**inputs, max_new_tokens=2, cache_implementation="static") + self.assertEqual(cap.err, "") + + @require_torch_multi_gpu + @slow + def test_static_cache_multi_gpu(self): + """Regression test for #35164: static cache with multi-gpu""" + + model_id = "google/gemma-2-2b-it" + tokenizer = AutoTokenizer.from_pretrained(model_id) + + device_map = {"model.embed_tokens": 0, "model.norm": 1, "model.rotary_emb": 1, "lm_head": 0} + num_hidden_layers = 26 + for i in range(num_hidden_layers): + device_map[f"model.layers.{i}"] = 0 if i < 13 else 1 + + model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype="bfloat16", + device_map=device_map, + ) + inputs = tokenizer("Today is a beautiful day!", return_tensors="pt").to(0) + _ = model(**inputs) + _ = model.generate(**inputs, max_new_tokens=2, cache_implementation="hybrid") + + +@require_torch +class CacheExportIntegrationTest(unittest.TestCase): + """Cache tests that rely on `torch.export()` and model loading""" + def test_dynamic_cache_exportability(self): model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM") model = model.eval() @@ -282,485 +748,3 @@ class CacheTest(unittest.TestCase): n_static_value_caches = n_static_value_caches + 1 self.assertEqual(n_static_key_caches, model.config.num_hidden_layers) self.assertEqual(n_static_value_caches, model.config.num_hidden_layers) - - -@require_torch_accelerator -@slow -class CacheIntegrationTest(unittest.TestCase): - def test_dynamic_cache_hard(self): - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left") - model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16 - ) - inputs = tokenizer(["Here's everything I know about cats. Cats"], return_tensors="pt").to(model.device) - - # DynamicCache and the legacy cache format should be equivalent - set_seed(0) - gen_out_legacy = model.generate(**inputs, do_sample=True, max_new_tokens=256) - set_seed(0) - gen_out = model.generate(**inputs, do_sample=True, max_new_tokens=256, past_key_values=DynamicCache()) - self.assertListEqual(gen_out_legacy.tolist(), gen_out.tolist()) - - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - expected_text = ( - "Here's everything I know about cats. Cats are mysterious creatures. They can't talk, and they don't like " - "to be held. They don't play fetch, and they don't like to be hugged. But they do like to be petted.\n" - "Cats are also very independent. They don't like to be told what to do, and they don't like to be told " - "what to eat. They are also very territorial. They don't like to share their food or their toys.\nCats " - "are also very curious. They like to explore, and they like to play. They are also very fast. They can " - "run very fast, and they can jump very high.\nCats are also very smart. They can learn tricks, and they " - "can solve problems. They are also very playful. They like to play with toys, and they like to play with " - "other cats.\nCats are also very affectionate. They like to be petted, and they like to be held. They " - "also like to be scratched.\nCats are also very clean. They like to groom themselves, and they like to " - "clean their litter box.\nCats are also very independent. They don't" - ) - self.assertEqual(decoded[0], expected_text) - - def test_dynamic_cache_batched(self): - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left") - tokenizer.pad_token = tokenizer.eos_token - model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16 - ) - inputs = tokenizer(["A sequence: 1, 2, 3, 4, 5", "A sequence: A, B, C"], padding=True, return_tensors="pt").to( - model.device - ) - - gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10, past_key_values=DynamicCache()) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - expected_text = ["A sequence: 1, 2, 3, 4, 5, 6, 7, 8,", "A sequence: A, B, C, D, E, F, G, H"] - self.assertListEqual(decoded, expected_text) - - def test_dynamic_cache_beam_search(self): - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left") - model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16 - ) - - inputs = tokenizer(["The best color is"], return_tensors="pt").to(model.device) - gen_out = model.generate( - **inputs, - do_sample=False, - max_new_tokens=20, - num_beams=2, - num_return_sequences=2, - ) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - expected_text = [ - "The best color is the one that makes you feel good.\nThe best color is the one that makes you feel good", - "The best color is the one that suits you.\nThe best color is the one that suits you. The", - ] - self.assertListEqual(decoded, expected_text) - - def test_hybrid_cache_n_sequences(self): - tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b") - model = AutoModelForCausalLM.from_pretrained( - "google/gemma-2-9b", - device_map="auto", - torch_dtype=torch.bfloat16, - attn_implementation="eager", - ) - - inputs = tokenizer(["Hello I am doing"], return_tensors="pt").to(model.device) - - gen_out = model.generate( - **inputs, - do_sample=False, - max_new_tokens=20, - num_return_sequences=2, - num_beams=2, - ) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - expected_text = [ - "Hello I am doing a project for my school and I am trying to make a program that will allow me to input a", - "Hello I am doing a project for my school and I am trying to make a program that will allow me to use a", - ] - self.assertListEqual(decoded, expected_text) - - @require_non_xpu - @require_gptq - def test_sink_cache_hard(self): - tokenizer = AutoTokenizer.from_pretrained("TheBloke/LLaMa-7B-GPTQ") - model = AutoModelForCausalLM.from_pretrained("TheBloke/LLaMa-7B-GPTQ", device_map="auto") - - inputs = tokenizer(["Vaswani et al. (2017) introduced the Transformers"], return_tensors="pt").to(model.device) - - # Set up the SinkCache. Using a small window length to contain computational complexity. If this example is run - # without a SinkCache, the last few tokens are gibberish (ends in "of the of the of a of a of") - cache = SinkCache(window_length=508, num_sink_tokens=4) - gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=3000, past_key_values=cache) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - self.assertTrue(decoded[0].endswith("to perform a variety of tasks. The Transformer is a neural network")) - - def test_sink_cache_iterative_prompts(self): - """Tests that SinkCache supports more than one new token at once, when shifting the cache""" - tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") - model = AutoModelForCausalLM.from_pretrained( - "HuggingFaceH4/zephyr-7b-beta", device_map="auto", torch_dtype=torch.float16 - ) - prompt = ( - "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences " - "and must-see attractions." - ) - - # Prepare generation settings - cache = SinkCache(window_length=256, num_sink_tokens=4) - input_ids = torch.tensor([], device=model.device, dtype=torch.int) - for _ in range(3): - # Tokenize the prompt with the correct chat template - chat = [{"role": "user", "content": prompt}] - tokenized_chat = tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True).to( - model.device - ) - input_ids = torch.cat((input_ids, tokenized_chat), dim=1) - - # Perform the generation - gen_out = model.generate( - input_ids, do_sample=False, max_new_tokens=100, past_key_values=cache, use_cache=True - ) - input_ids = gen_out - - # We went well beyond the cache length - self.assertTrue(input_ids.shape[1] > cache.get_max_cache_shape() * 1.5) - - # And it still produces a coherent english - decoded = tokenizer.batch_decode(input_ids, skip_special_tokens=True) - last_output = ( - "<|assistant|>\nAs the sun began to set over the Pacific Ocean, I found myself standing on the shores of " - "Waikiki Beach, my heart filled with awe and wonder. I had just returned from a two-week journey to the " - "beautiful island of Hawaii, and it had been an unforgettable experience filled with cultural experiences " - "and must-see attractions that left me breathless.\n\nOne of the most memorable experiences of my trip " - "was visiting the historic district of Honolulu. Here," - ) - self.assertTrue(decoded[0].endswith(last_output)) - - @require_torch_gpu - @parameterized.expand( - [ - ("eager", "static"), - ("sdpa", "static"), - ] - ) - def test_static_cache_greedy_decoding_pad_left(self, attn_implementation, cache_implementation): - EXPECTED_GENERATION = [ - "The best color is the one that complements the skin tone of the", - "We should not undermind the issues at hand.\nWe should not undermind the issues", - ] - - tokenizer = AutoTokenizer.from_pretrained( - "NousResearch/Llama-2-7b-chat-hf", padding_side="left", pad_token="" - ) - model = AutoModelForCausalLM.from_pretrained( - "NousResearch/Llama-2-7b-chat-hf", - torch_dtype=torch.bfloat16, - attn_implementation=attn_implementation, - ).to(torch_device) - inputs = tokenizer( - ["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt" - ).to(model.device) - - set_seed(0) - gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - with self.subTest(f"{attn_implementation}, dynamic"): - self.assertListEqual(decoded, EXPECTED_GENERATION) - - set_seed(0) - model.generation_config.cache_implementation = cache_implementation - gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - with self.subTest(f"{attn_implementation}, static, eager"): - self.assertListEqual(decoded, EXPECTED_GENERATION) - - set_seed(0) - model.forward = torch.compile(model.forward) - gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - with self.subTest(f"{attn_implementation}, static, compiled"): - self.assertListEqual(decoded, EXPECTED_GENERATION) - - @require_torch_gpu - @parameterized.expand( - [ - ("eager", "static"), - ("sdpa", "static"), - ] - ) - def test_static_cache_greedy_decoding_pad_right(self, attn_implementation, cache_implementation): - EXPECTED_GENERATION = [ - "The best color isЋ the one that complements the skin tone of", - "We should not undermind the issues at hand.\nWe should not undermind the issues", - ] - - tokenizer = AutoTokenizer.from_pretrained( - "NousResearch/Llama-2-7b-chat-hf", padding_side="right", pad_token="" - ) - model = AutoModelForCausalLM.from_pretrained( - "NousResearch/Llama-2-7b-chat-hf", - torch_dtype=torch.bfloat16, - attn_implementation=attn_implementation, - ).to(torch_device) - inputs = tokenizer( - ["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt" - ).to(model.device) - - set_seed(0) - gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - with self.subTest(f"{attn_implementation}, dynamic"): - self.assertListEqual(decoded, EXPECTED_GENERATION) - - set_seed(0) - model.generation_config.cache_implementation = cache_implementation - gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - with self.subTest(f"{attn_implementation}, static, eager"): - self.assertListEqual(decoded, EXPECTED_GENERATION) - - def test_dynamic_cache_extra_left_padding(self): - """Tests that adding extra left-padding does not affect the generation with the dynamic cache""" - EXPECTED_GENERATION = [ - "The best color is the one that complements the skin tone of the", - "We should not undermind the issues at hand.\nWe should not undermind the issues", - ] - - tokenizer = AutoTokenizer.from_pretrained( - "NousResearch/Llama-2-7b-chat-hf", padding_side="left", pad_token="" - ) - model = AutoModelForCausalLM.from_pretrained( - "NousResearch/Llama-2-7b-chat-hf", - torch_dtype=torch.bfloat16, - ).to(torch_device) - inputs = tokenizer( - ["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt" - ).to(model.device) - - gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - self.assertListEqual(decoded, EXPECTED_GENERATION) - - # Now with extra left-padding - inputs_expanded = tokenizer( - ["The best color is", "We should not undermind the issues at hand"], - padding=True, - return_tensors="pt", - pad_to_multiple_of=32, - ).to(model.device) - self.assertTrue(inputs.input_ids.shape[1] < inputs_expanded.input_ids.shape[1]) - gen_out = model.generate(**inputs_expanded, do_sample=False, max_new_tokens=10) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - self.assertListEqual(decoded, EXPECTED_GENERATION) - - @parameterized.expand( - [ - "static", - ] - ) - def test_static_cache_extra_left_padding(self, cache_implementation): - """Tests that adding extra left-padding does not affect the generation with the static cache""" - EXPECTED_GENERATION = [ - "The best color is the one that complements the skin tone of the", - "We should not undermind the issues at hand.\nWe should not undermind the issues", - ] - - tokenizer = AutoTokenizer.from_pretrained( - "NousResearch/Llama-2-7b-chat-hf", padding_side="left", pad_token="" - ) - model = AutoModelForCausalLM.from_pretrained( - "NousResearch/Llama-2-7b-chat-hf", - torch_dtype=torch.bfloat16, - ).to(torch_device) - inputs = tokenizer( - ["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt" - ).to(model.device) - - model.generation_config.cache_implementation = cache_implementation - - gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - self.assertListEqual(decoded, EXPECTED_GENERATION) - - # Now with extra left-padding - inputs_expanded = tokenizer( - ["The best color is", "We should not undermind the issues at hand"], - padding=True, - return_tensors="pt", - pad_to_multiple_of=32, - ).to(model.device) - self.assertTrue(inputs.input_ids.shape[1] < inputs_expanded.input_ids.shape[1]) - gen_out = model.generate(**inputs_expanded, do_sample=False, max_new_tokens=10) - decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) - self.assertListEqual(decoded, EXPECTED_GENERATION) - - @unittest.skip(reason="TODO @gante static cache's does not support beam search yet") - def test_static_cache_beam_search(self): - pass - - @require_torch_accelerator - def test_offloaded_cache_equivalent_to_dynamic_cache(self): - """Tests that OffloadedCache produces the same result as the default DynamicCache""" - model_name = "microsoft/Phi-3-mini-4k-instruct" - tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16) - device = model.device - - if not is_torch_greater_or_equal("2.7", accept_dev=True) and device.type == "xpu": - self.skipTest(reason="This test requires torch >= 2.7 to run on xpu.") - - input_text = "Fun fact:" - inputs = tokenizer(input_text, return_tensors="pt").to(device) - common = { - "num_beams": 4, - "num_beam_groups": 2, - "num_return_sequences": 4, - "diversity_penalty": 1.0, - "max_new_tokens": 20, - "early_stopping": True, - } - original = GenerationConfig(**common) - offloaded = GenerationConfig(cache_implementation="offloaded", **common) - original_outputs = model.generate(generation_config=original, **inputs) - offloaded_outputs = model.generate(generation_config=offloaded, **inputs) - for original_output, offloaded_output in zip(original_outputs, offloaded_outputs): - assert torch.all(original_output == offloaded_output).item() - - @require_torch_accelerator - def test_offloaded_cache_uses_less_memory_than_dynamic_cache(self): - """Tests that OffloadedCache uses less memory than the default DynamicCache""" - model_name = "microsoft/Phi-3-mini-4k-instruct" - tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16) - device = model.device - - if not is_torch_greater_or_equal("2.7", accept_dev=True) and device.type == "xpu": - self.skipTest(reason="This test requires torch >= 2.7 to run on xpu.") - - input_text = "Fun fact:" - inputs = tokenizer(input_text, return_tensors="pt").to(device) - common = { - "num_beams": 4, - "num_beam_groups": 2, - "num_return_sequences": 4, - "diversity_penalty": 1.0, - "max_new_tokens": 20, - "early_stopping": True, - } - original = GenerationConfig(**common) - offloaded = GenerationConfig(cache_implementation="offloaded", **common) - - torch_accelerator_module = None - if device.type == "cuda": - torch_accelerator_module = torch.cuda - elif device.type == "xpu": - torch_accelerator_module = torch.xpu - - torch_accelerator_module.reset_peak_memory_stats(device) - model.generate(generation_config=original, **inputs) - original_peak_memory = torch_accelerator_module.max_memory_allocated(device) - torch_accelerator_module.reset_peak_memory_stats(device) - model.generate(generation_config=offloaded, **inputs) - offloaded_peak_memory = torch_accelerator_module.max_memory_allocated(device) - print(f"original_peak_memory: {original_peak_memory}, offloaded_peak_memory: {offloaded_peak_memory}") - assert offloaded_peak_memory < original_peak_memory - - @require_torch_gpu - def test_cache_copy(self): - model_name = "microsoft/Phi-3-mini-4k-instruct" - tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16) - - prompt_cache = StaticCache( - config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16 - ) - - INITIAL_PROMPT = "You are a helpful assistant. " - inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda") - # This is the common prompt cached, we need to run forward without grad to be abel to copy - with torch.no_grad(): - prompt_cache = model(**inputs_initial_prompt, past_key_values=prompt_cache).past_key_values - - prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"] - responses = [] - for prompt in prompts: - new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda") - past_key_values = copy.deepcopy(prompt_cache) - outputs = model.generate(**new_inputs, past_key_values=past_key_values, max_new_tokens=40) - response = tokenizer.batch_decode(outputs)[0] - responses.append(response) - - EXPECTED_DECODED_TEXT = [ - "You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTraveling is an enriching experience that broadens our horizons and exposes us to new cultures, landscapes, and people. Whether it's a week", - 'You are a helpful assistant. What is the capital of France?\n\n\n## Response:Paris is the capital of France.\n\n\n\n\n\n## Query:\n\nIn a detailed analysis, compare the economic impacts of the introduction of the' - ] # fmt: skip - self.assertEqual(responses, EXPECTED_DECODED_TEXT) - - @require_torch_multi_gpu - def test_data_parallel_dynamic_cache(self): - """ - Tests that the dynamic cache works with nn.DataParallel. Under the hood, `DynamicCache` is rebuilt from - multiple `DynamicCache` in the gather step. - """ - - model_repo = "hf-internal-testing/tiny-random-MistralForCausalLM" - model = AutoModelForCausalLM.from_pretrained(model_repo).to(torch_device) - tokenizer = AutoTokenizer.from_pretrained(model_repo) - - # w/o DP: batch_size = num_gpu - # w DP: batch_size = 1 (with num_gpus replicas) - num_gpus = get_gpu_count() - model_inputs = tokenizer(["foo bar"] * num_gpus, return_tensors="pt").to(model.device) - - # w/o DP - no_parallelism_cache = model(**model_inputs).past_key_values - self.assertIsInstance(no_parallelism_cache, DynamicCache) - - # w DP - model = torch.nn.DataParallel(model) - parallelism_cache = model(**model_inputs).past_key_values - self.assertIsInstance(parallelism_cache, DynamicCache) - - # Check that the caches are the same - for layer_idx in range(len(no_parallelism_cache)): - for kv_idx in range(2): # 0 = key, 1 = value - torch.testing.assert_close( - actual=parallelism_cache[layer_idx][kv_idx], expected=no_parallelism_cache[layer_idx][kv_idx] - ) - - @require_torch_gpu - def test_static_cache_no_cuda_graph_skips(self): - """ - Tests generating with static cache and compilation doesn't skip cuda graphs. Regression test for #36543. - - (? We set `fullgraph=True`, which according to torch docs means it should raise an exception. Instead, - messages are being thrown to stderr?) - """ - model_repo = "hf-internal-testing/tiny-random-MistralForCausalLM" - model = AutoModelForCausalLM.from_pretrained(model_repo).to(torch_device) - tokenizer = AutoTokenizer.from_pretrained(model_repo) - inputs = tokenizer(["foo bar"], return_tensors="pt").to(torch_device) - - # on `main`, prior to #36543, this would send stderr messages about cuda graphs being skipped. - with CaptureStderr() as cap: - model.generate(**inputs, max_new_tokens=2, cache_implementation="static") - self.assertEqual(cap.err, "") - - @require_torch_multi_gpu - def test_static_cache_multi_gpu(self): - """Regression test for #35164: static cache with multi-gpu""" - - model_id = "google/gemma-2-2b-it" - tokenizer = AutoTokenizer.from_pretrained(model_id) - - device_map = {"model.embed_tokens": 0, "model.norm": 1, "model.rotary_emb": 1, "lm_head": 0} - num_hidden_layers = 26 - for i in range(num_hidden_layers): - device_map[f"model.layers.{i}"] = 0 if i < 13 else 1 - - model = AutoModelForCausalLM.from_pretrained( - model_id, - torch_dtype="bfloat16", - device_map=device_map, - ) - inputs = tokenizer("Today is a beautiful day!", return_tensors="pt").to(0) - _ = model(**inputs) - _ = model.generate(**inputs, max_new_tokens=2, cache_implementation="hybrid")