From 69bcb86c58ca23b946404ba017b782d44c7a5572 Mon Sep 17 00:00:00 2001
From: Guang Yang <42389959+guangy10@users.noreply.github.com>
Date: Tue, 25 Mar 2025 02:46:38 -0700
Subject: [PATCH] Export for Phi4-mini (#36780)

* Export for Phi4-mini

* Update tests/models/phi3/test_modeling_phi3.py

---------

Co-authored-by: Guang Yang
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
---
Reviewer notes (free-form text between the "---" line and the diffstat is
not part of the commit):
- NOTE(review): `pad_token=""` in the hunk below is reproduced as received.
  Other angle-bracketed text appears to have been lost in this copy (the
  first Co-authored-by trailer has no address), so the original value may
  have been an angle-bracketed token; confirm against upstream before
  applying.
- NOTE(review): the `\n` escape in EXPECTED_TEXT_COMPLETION replaces a raw
  line break found inside the string literal in this copy; confirm the
  exact escape sequence against upstream.

 tests/models/phi3/test_modeling_phi3.py | 70 +++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/tests/models/phi3/test_modeling_phi3.py b/tests/models/phi3/test_modeling_phi3.py
index a6a9ab4e63..2edf52db3a 100644
--- a/tests/models/phi3/test_modeling_phi3.py
+++ b/tests/models/phi3/test_modeling_phi3.py
@@ -21,6 +21,7 @@ from typing import List
 from parameterized import parameterized
 
 from transformers import Phi3Config, StaticCache, is_torch_available, set_seed
+from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.testing_utils import (
     require_torch,
     slow,
@@ -707,3 +708,72 @@ class Phi3IntegrationTest(unittest.TestCase):
         ]
 
         self.assertListEqual(output_text, EXPECTED_OUTPUT)
+
+    @slow
+    def test_export_static_cache(self):
+        from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4
+
+        if not is_torch_greater_or_equal_than_2_4:
+            self.skipTest(reason="This test requires torch >= 2.4 to run.")
+
+        from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
+        from transformers.integrations.executorch import (
+            TorchExportableModuleWithStaticCache,
+            convert_and_export_with_cache,
+        )
+
+        model_id = "microsoft/Phi-4-mini-instruct"
+
+        tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token="", padding_side="right")
+        EXPECTED_TEXT_COMPLETION = [
+            "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user. \nA 45-year-old patient with a 10-year history of type 2 diabetes mellitus, who is currently on metformin and a SGLT2 inhibitor, presents with a 2-year history"
+        ]
+        max_generation_length = tokenizer(EXPECTED_TEXT_COMPLETION, return_tensors="pt", padding=True)[
+            "input_ids"
+        ].shape[-1]
+
+        # Load config
+        config = AutoConfig.from_pretrained(model_id)
+        # NOTE: To make the model exportable we need to set the rope scaling to default to avoid hitting
+        # the data-dependent control flow in _longrope_frequency_update. Alternatively, we can rewrite
+        # that function to avoid the data-dependent control flow.
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            config.rope_scaling["type"] = "default"
+
+        # Load model
+        device = "cpu"
+        dtype = torch.bfloat16
+        cache_implementation = "static"
+        attn_implementation = "sdpa"
+        batch_size = 1
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            config=config,
+            device_map=device,
+            torch_dtype=dtype,
+            attn_implementation=attn_implementation,
+            generation_config=GenerationConfig(
+                use_cache=True,
+                cache_implementation=cache_implementation,
+                max_length=max_generation_length,
+                cache_config={
+                    "batch_size": batch_size,
+                    "max_cache_len": max_generation_length,
+                },
+            ),
+        )
+
+        prompt = [
+            "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."
+        ]
+        prompt_tokens = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
+        prompt_token_ids = prompt_tokens["input_ids"]
+        max_new_tokens = max_generation_length - prompt_token_ids.shape[-1]
+
+        # Static Cache + export
+        exported_program = convert_and_export_with_cache(model)
+        ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
+            exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
+        )
+        ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True)
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text)