From 1e9087368c14cef9d759f3ebfc6602b9350f345d Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 23 Apr 2025 16:56:36 +0200 Subject: [PATCH] [internvl] fix chat template (#37656) * fix chat template * update * update conversion * rename `fake_image_token` in tests --- docs/source/en/model_doc/internvl.md | 1 + .../convert_internvl_weights_to_hf.py | 1 + .../models/internvl/processing_internvl.py | 136 +++++++----------- .../models/internvl/test_modeling_internvl.py | 36 +++-- .../internvl/test_processor_internvl.py | 34 +---- 5 files changed, 88 insertions(+), 120 deletions(-) diff --git a/docs/source/en/model_doc/internvl.md b/docs/source/en/model_doc/internvl.md index fa03a0e70d..4ac56c8537 100644 --- a/docs/source/en/model_doc/internvl.md +++ b/docs/source/en/model_doc/internvl.md @@ -257,6 +257,7 @@ InternVL models can also handle video inputs. Here is an example of how to perfo ... add_generation_prompt=True, ... tokenize=True, ... return_dict=True, +... num_frames=8, >>> ).to(model.device, dtype=torch.float16) >>> output = model.generate(**inputs, max_new_tokens=25) diff --git a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py b/src/transformers/models/internvl/convert_internvl_weights_to_hf.py index 52a7b464f9..539085c471 100644 --- a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py +++ b/src/transformers/models/internvl/convert_internvl_weights_to_hf.py @@ -312,6 +312,7 @@ def write_tokenizer(save_dir: str, push_to_hub: bool = False, path: str = None, "start_image_token": "", "end_image_token": "", "context_image_token": "", + "video_token": "