diff --git a/docs/source/en/model_doc/internvl.md b/docs/source/en/model_doc/internvl.md index fa03a0e70d..4ac56c8537 100644 --- a/docs/source/en/model_doc/internvl.md +++ b/docs/source/en/model_doc/internvl.md @@ -257,6 +257,7 @@ InternVL models can also handle video inputs. Here is an example of how to perfo ... add_generation_prompt=True, ... tokenize=True, ... return_dict=True, +... num_frames=8, >>> ).to(model.device, dtype=torch.float16) >>> output = model.generate(**inputs, max_new_tokens=25) diff --git a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py b/src/transformers/models/internvl/convert_internvl_weights_to_hf.py index 52a7b464f9..539085c471 100644 --- a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py +++ b/src/transformers/models/internvl/convert_internvl_weights_to_hf.py @@ -312,6 +312,7 @@ def write_tokenizer(save_dir: str, push_to_hub: bool = False, path: str = None, "start_image_token": "", "end_image_token": "", "context_image_token": "", + "video_token": "