[docs] add the missing import for Image and bug fix (#34776)
* add the missing import for Image lib * add more devices in comment * bug fix
This commit is contained in:
@@ -47,7 +47,7 @@ model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
|
|||||||
processor = LlavaProcessor.from_pretrained(model_id)
|
processor = LlavaProcessor.from_pretrained(model_id)
|
||||||
|
|
||||||
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
|
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
|
||||||
model.to("cuda")
|
model.to("cuda") # can also be xpu, mps, npu etc. depending on your hardware accelerator
|
||||||
```
|
```
|
||||||
|
|
||||||
Some models directly consume the `<video>` token, while others accept a number of `<image>` tokens equal to the number of sampled frames. This model handles videos in the latter fashion. We will write a simple utility to handle image tokens, and another utility to get a video from a URL and sample frames from it.
|
Some models directly consume the `<video>` token, while others accept a number of `<image>` tokens equal to the number of sampled frames. This model handles videos in the latter fashion. We will write a simple utility to handle image tokens, and another utility to get a video from a URL and sample frames from it.
|
||||||
@@ -56,6 +56,7 @@ Some models directly consume the `<video>` token, and others accept `<image>` to
|
|||||||
import uuid
|
import uuid
|
||||||
import requests
|
import requests
|
||||||
import cv2
|
import cv2
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
def replace_video_with_images(text, frames):
|
def replace_video_with_images(text, frames):
|
||||||
return text.replace("<video>", "<image>" * frames)
|
return text.replace("<video>", "<image>" * frames)
|
||||||
@@ -82,7 +83,7 @@ def sample_frames(url, num_frames):
|
|||||||
if i % interval == 0:
|
if i % interval == 0:
|
||||||
frames.append(pil_img)
|
frames.append(pil_img)
|
||||||
video.release()
|
video.release()
|
||||||
return frames
|
return frames[:num_frames]
|
||||||
```
|
```
|
||||||
|
|
||||||
Let's get our inputs. We will sample frames and concatenate them.
|
Let's get our inputs. We will sample frames and concatenate them.
|
||||||
@@ -127,7 +128,7 @@ This model has a prompt template that looks like following. First, we'll put all
|
|||||||
user_prompt = "Are these two cats in these two videos doing the same thing?"
|
user_prompt = "Are these two cats in these two videos doing the same thing?"
|
||||||
toks = "<image>" * 12
|
toks = "<image>" * 12
|
||||||
prompt = "<|im_start|>user"+ toks + f"\n{user_prompt}<|im_end|><|im_start|>assistant"
|
prompt = "<|im_start|>user"+ toks + f"\n{user_prompt}<|im_end|><|im_start|>assistant"
|
||||||
inputs = processor(prompt, images=videos).to(model.device, model.dtype)
|
inputs = processor(text=prompt, images=videos, return_tensors="pt").to(model.device, model.dtype)
|
||||||
```
|
```
|
||||||
|
|
||||||
We can now call [`~GenerationMixin.generate`] for inference. The model's output contains both the question from our input and its answer, so we keep only the text that follows the prompt and the `assistant` marker in the model output.
|
We can now call [`~GenerationMixin.generate`] for inference. The model's output contains both the question from our input and its answer, so we keep only the text that follows the prompt and the `assistant` marker in the model output.
|
||||||
|
|||||||
Reference in New Issue
Block a user