diff --git a/docs/source/en/model_doc/internvl.md b/docs/source/en/model_doc/internvl.md
index fa03a0e70d..4ac56c8537 100644
--- a/docs/source/en/model_doc/internvl.md
+++ b/docs/source/en/model_doc/internvl.md
@@ -257,6 +257,7 @@ InternVL models can also handle video inputs. Here is an example of how to perfo
... add_generation_prompt=True,
... tokenize=True,
... return_dict=True,
+... num_frames=8,
>>> ).to(model.device, dtype=torch.float16)
>>> output = model.generate(**inputs, max_new_tokens=25)
diff --git a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py b/src/transformers/models/internvl/convert_internvl_weights_to_hf.py
index 52a7b464f9..539085c471 100644
--- a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py
+++ b/src/transformers/models/internvl/convert_internvl_weights_to_hf.py
@@ -312,6 +312,7 @@ def write_tokenizer(save_dir: str, push_to_hub: bool = False, path: str = None,
"start_image_token": "
",
"end_image_token": "",
"context_image_token": "",
+ "video_token": "