Export SmolvLM (#39614)

Export SmolVLM for ExecuTorch
2025-08-05 07:20:23 -07:00
parent c430047602
commit d2ae766836
5 changed files with 282 additions and 18 deletions
--- a/tests/models/smolvlm/test_modeling_smolvlm.py
+++ b/tests/models/smolvlm/test_modeling_smolvlm.py
@@ -595,3 +595,80 @@ class SmolVLMForConditionalGenerationIntegrationTest(unittest.TestCase):

        expected_generated_text = 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature'  # fmt: skip
        self.assertEqual(generated_texts[0], expected_generated_text)
+
+    @slow
+    def test_export_smolvlm_vision_encoder(self):
+        from transformers import AutoConfig
+        from transformers.integrations.executorch import TorchExportableModuleForVLM
+
+        model_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
+
+        # NOTE: The attention_mask is prepared internally in the vision encoder, depending on whether flash attention is used or not
+        # For ExecuTorch, flash attention is not supported, so the way of exporting vison encoder should be compatible with text-decoder
+        config = AutoConfig.from_pretrained(model_id)
+        config.text_config._flash_attn_2_enabled = False
+
+        # Load model and extract vision encoder
+        model = SmolVLMForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.float32,
+            config=config,
+        )
+
+        exportable_module = TorchExportableModuleForVLM(model)
+        exported_program = exportable_module.export_vision_encoder()
+        self.assertIsInstance(exported_program, torch.export.ExportedProgram)
+
+    @slow
+    def test_export_smolvlm_connector(self):
+        from transformers import AutoConfig
+        from transformers.integrations.executorch import TorchExportableModuleForVLM
+
+        model_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
+
+        # NOTE: The attention_mask is prepared internally in the vision encoder, depending on whether flash attention is used or not
+        # For ExecuTorch, flash attention is not supported, so the way of exporting vison encoder should be compatible with text-decoder
+        config = AutoConfig.from_pretrained(model_id)
+        config.text_config._flash_attn_2_enabled = False
+
+        # Load the model and extract the connector (multi-modal projector)
+        model = SmolVLMForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.float32,
+            config=config,
+        )
+
+        connector = model.model.connector
+        connector.eval()
+
+        exportable_module = TorchExportableModuleForVLM(model)
+        exported_program = exportable_module.export_connector()
+        self.assertIsInstance(exported_program, torch.export.ExportedProgram)
+
+    @slow
+    def test_export_smolvlm_text_decoder(self):
+        from transformers import AutoConfig
+        from transformers.integrations.executorch import TorchExportableModuleForVLM
+
+        model_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
+
+        # NOTE: The attention_mask is prepared internally in the vision encoder, depending on whether flash attention is used or not
+        # For ExecuTorch, flash attention is not supported, so the way of exporting vison encoder should be compatible with text-decoder
+        config = AutoConfig.from_pretrained(model_id)
+        config.text_config._flash_attn_2_enabled = False
+        config.text_config.use_cache = True
+        config.text_config.attn_implementation = "sdpa"
+
+        # Load the model and extract the text decoder
+        model = SmolVLMForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.float32,
+            config=config,
+        )
+
+        text_decoder = model.model.text_model
+        text_decoder.eval()
+
+        exportable_module = TorchExportableModuleForVLM(model)
+        exported_program = exportable_module.export_text_decoder()
+        self.assertIsInstance(exported_program, torch.export.ExportedProgram)