Update Phi4 converter (#37594)

* fix converter
* Update phi4_multimodal.md
2025-04-17 23:08:24 +02:00
parent 40cba20e87
commit 4acf692ace
2 changed files with 15 additions and 3 deletions
--- a/docs/source/en/model_doc/phi4_multimodal.md
+++ b/docs/source/en/model_doc/phi4_multimodal.md
@@ -64,7 +64,7 @@ inputs = processor.apply_chat_template(
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
-).to(device, torch.float16)
+).to(device)
 # Generate response
 generate_ids = model.generate(
@@ -98,8 +98,7 @@ inputs = processor.apply_chat_template(
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
-    sample_rate=sample_rate,
+).to(device)
-).to(device, torch.float16)
 generate_ids = model.generate(
    **inputs,
--- a/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py
+++ b/src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py
@@ -170,12 +170,25 @@ def convert_and_save_processor(input_dir: str, output_dir: str):
    """Convert the processor."""
    original_processor = AutoProcessor.from_pretrained(input_dir, trust_remote_code=True)
    original_processor.tokenizer.extra_special_tokens = {"image_token": "<|image|>", "audio_token": "<|audio|>"}
    # We need to add those temporarily to instantiate the processor
    original_processor.tokenizer.image_token = "<|image|>"
    original_processor.tokenizer.audio_token = "<|audio|>"
    original_processor.tokenizer.image_token_id = 200010
    original_processor.tokenizer.audio_token_id = 200011
    converted_processor = Phi4MultimodalProcessor(
        tokenizer=original_processor.tokenizer,
        image_processor=Phi4MultimodalImageProcessorFast(),
        audio_processor=Phi4MultimodalFeatureExtractor(),
        chat_template=CHAT_TEMPLATE,
    )
    # We remove them before saving to avoid polluting somehow
    del converted_processor.tokenizer.image_token
    del converted_processor.tokenizer.image_token_id
    del converted_processor.tokenizer.audio_token
    del converted_processor.tokenizer.audio_token_id
    # Save the processor
    converted_processor.save_pretrained(output_dir)
    # we need to rename a few tokens but tokenizers doesn't allow doing that programmatically