From 4acf692acebf4b9f25613d5ee410259bad761594 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 17 Apr 2025 23:08:24 +0200 Subject: [PATCH] Update Phi4 converter (#37594) * fix converter * Update phi4_multimodal.md --- docs/source/en/model_doc/phi4_multimodal.md | 5 ++--- .../convert_phi4_multimodal_weights_to_hf.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/phi4_multimodal.md b/docs/source/en/model_doc/phi4_multimodal.md index 3fa2b61cc9..22b55792f6 100644 --- a/docs/source/en/model_doc/phi4_multimodal.md +++ b/docs/source/en/model_doc/phi4_multimodal.md @@ -64,7 +64,7 @@ inputs = processor.apply_chat_template( tokenize=True, return_dict=True, return_tensors="pt", -).to(device, torch.float16) +).to(device) # Generate response generate_ids = model.generate( @@ -98,8 +98,7 @@ inputs = processor.apply_chat_template( tokenize=True, return_dict=True, return_tensors="pt", - sample_rate=sample_rate, -).to(device, torch.float16) +).to(device) generate_ids = model.generate( **inputs, diff --git a/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py b/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py index 65ced8db26..b1a4ac90ac 100644 --- a/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py +++ b/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py @@ -170,12 +170,25 @@ def convert_and_save_processor(input_dir: str, output_dir: str): """Convert the processor.""" original_processor = AutoProcessor.from_pretrained(input_dir, trust_remote_code=True) original_processor.tokenizer.extra_special_tokens = {"image_token": "<|image|>", "audio_token": "<|audio|>"} + # We need to add those temporarily to instantiate the processor + original_processor.tokenizer.image_token = "<|image|>" + original_processor.tokenizer.audio_token = "<|audio|>" + original_processor.tokenizer.image_token_id = 200010 + 
original_processor.tokenizer.audio_token_id = 200011 + converted_processor = Phi4MultimodalProcessor( tokenizer=original_processor.tokenizer, image_processor=Phi4MultimodalImageProcessorFast(), audio_processor=Phi4MultimodalFeatureExtractor(), chat_template=CHAT_TEMPLATE, ) + # Remove the temporary attributes again before saving so they do not pollute the saved tokenizer + del converted_processor.tokenizer.image_token + del converted_processor.tokenizer.image_token_id + del converted_processor.tokenizer.audio_token + del converted_processor.tokenizer.audio_token_id + + # Save the processor converted_processor.save_pretrained(output_dir) # we need to rename a few tokens but tokenizers doesn't allow doing that programatically