Support input_embeds in torch exportable decoders (#39836)

* Support input_embeds in torch exportable decoders * Hybrid cache update * Manually change some callsites * AI changes the rest of the call sites * Make either input_ids/inputs_embeds mandatory * Clean up * Ruff check --fix * Fix test * pr review * Revert config/generation_config changes * Ruff check
2025-08-07 01:51:31 -07:00
parent cdeaad96b7
commit 6121e9e46c
11 changed files with 325 additions and 85 deletions
--- a/tests/models/gemma/test_modeling_gemma.py
+++ b/tests/models/gemma/test_modeling_gemma.py
@@ -460,7 +460,10 @@ class GemmaIntegrationTest(unittest.TestCase):
        from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM

        exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
-        exported_program = exportable_module.export()
+        exported_program = exportable_module.export(
+            input_ids=prompt_token_ids,
+            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+        )
        ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
            exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
        )
--- a/tests/models/gemma2/test_modeling_gemma2.py
+++ b/tests/models/gemma2/test_modeling_gemma2.py
@@ -365,7 +365,10 @@ class Gemma2IntegrationTest(unittest.TestCase):
        from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM

        exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
-        exported_program = exportable_module.export()
+        exported_program = exportable_module.export(
+            input_ids=prompt_token_ids,
+            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+        )
        ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
            exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
        )
@@ -389,7 +392,10 @@ class Gemma2IntegrationTest(unittest.TestCase):
        # Export + HybridCache
        model.eval()
        exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
-        exported_program = exportable_module.export()
+        exported_program = exportable_module.export(
+            input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
+            cache_position=torch.tensor([0], dtype=torch.long, device=model.device),
+        )

        # Test generation with the exported model
        prompt = "What is the capital of France?"
--- a/tests/models/gemma3/test_modeling_gemma3.py
+++ b/tests/models/gemma3/test_modeling_gemma3.py
@@ -822,7 +822,10 @@ class Gemma3IntegrationTest(unittest.TestCase):
        # Export + HybridCache
        model.eval()
        exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
-        exported_program = exportable_module.export()
+        exported_program = exportable_module.export(
+            input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
+            cache_position=torch.tensor([0], dtype=torch.long, device=model.device),
+        )
        logging.info(f"\nExported program: {exported_program}")

        # Test generation with the exported model
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -353,7 +353,10 @@ class LlamaIntegrationTest(unittest.TestCase):
            from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM

            exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
-            exported_program = exportable_module.export()
+            exported_program = exportable_module.export(
+                input_ids=prompt_token_ids,
+                cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+            )
            ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
                exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
            )
--- a/tests/models/olmo/test_modeling_olmo.py
+++ b/tests/models/olmo/test_modeling_olmo.py
@@ -384,7 +384,10 @@ class OlmoIntegrationTest(unittest.TestCase):
        from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM

        exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
-        exported_program = exportable_module.export()
+        exported_program = exportable_module.export(
+            input_ids=prompt_token_ids,
+            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+        )
        ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
            exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
        )
--- a/tests/models/phi3/test_modeling_phi3.py
+++ b/tests/models/phi3/test_modeling_phi3.py
@@ -417,7 +417,10 @@ class Phi3IntegrationTest(unittest.TestCase):
        from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM

        exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
-        exported_program = exportable_module.export()
+        exported_program = exportable_module.export(
+            input_ids=prompt_token_ids,
+            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+        )
        ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
            exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
        )
--- a/tests/models/qwen2/test_modeling_qwen2.py
+++ b/tests/models/qwen2/test_modeling_qwen2.py
@@ -303,7 +303,11 @@ class Qwen2IntegrationTest(unittest.TestCase):
        strict = version.parse(torch.__version__) != version.parse(
            "2.7.0"
        )  # Due to https://github.com/pytorch/pytorch/issues/150994
-        exported_program = exportable_module.export(strict=strict)
+        exported_program = exportable_module.export(
+            input_ids=prompt_token_ids,
+            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+            strict=strict,
+        )
        ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
            exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
        )
--- a/tests/models/qwen3/test_modeling_qwen3.py
+++ b/tests/models/qwen3/test_modeling_qwen3.py
@@ -293,7 +293,11 @@ class Qwen3IntegrationTest(unittest.TestCase):
        from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM

        exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
-        exported_program = exportable_module.export(strict=strict)
+        exported_program = exportable_module.export(
+            input_ids=prompt_token_ids,
+            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+            strict=strict,
+        )
        ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
            exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
        )