From 934e1b84e956ea74323b92eb86300126b8139a42 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 22 May 2024 16:56:41 +0500 Subject: [PATCH] Update video-llava docs (#30935) * update video-llava * Update docs/source/en/model_doc/video_llava.md Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/model_doc/video_llava.md | 96 ++++++++++++++++--- .../video_llava/modeling_video_llava.py | 14 +-- 2 files changed, 91 insertions(+), 19 deletions(-) diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index 7f27e65a4e..307c55bb2c 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -42,21 +42,28 @@ a unified visual representation, outperforming models designed specifically for work to provide modest insights into the multi-modal inputs for the LLM* -Tips: +## Usage tips - We advise users to use padding_side="left" when computing batched generation as it leads to more accurate results. Simply make sure to call processor.tokenizer.padding_side = "left" before generating. - Note the model has not been explicitly trained to process multiple images/videos in the same prompt, although this is technically possible, you may experience inaccurate results. -- For better results, we recommend users prompt the model with the correct prompt format: +- Note that video inputs should contain exactly 8 frames, since the models were trained in that setting. +This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). +The original code can be found [here](https://github.com/PKU-YuanGroup/Video-LLaVA). + + +## Usage example + +### Single Media Mode + +The model can accept both images and videos as input.
Here's example code for inference in half-precision (`torch.float16`): ```python import av import torch import numpy as np -import requests -from PIL import Image from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor def read_video_pyav(container, indices): @@ -79,36 +86,99 @@ def read_video_pyav(container, indices): frames.append(frame) return np.stack([x.to_ndarray(format="rgb24") for x in frames]) - -model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", device_map="auto") +# Load the model in half-precision +model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", torch_dtype=torch.float16, device_map="auto") processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf") +# Load the video as an np.array, uniformly sampling 8 frames video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset") - container = av.open(video_path) total_frames = container.streams.video[0].frames indices = np.arange(0, total_frames, total_frames / 8).astype(int) video = read_video_pyav(container, indices) +# For better results, we recommend prompting the model in the following format prompt = "USER: