Fix qwen2_5 get_rope_index tensor device locations (#37597)
* Fix qwen2_5 get_rope_index tensor device locations * simpler fix * edit right file for modular model * add a test * try normalizing type to fix non-video * fix some imports * add a video forward test with dummy input
This commit is contained in:
@@ -14,6 +14,7 @@
|
||||
"""Testing suite for the PyTorch Qwen2.5-VL model."""
|
||||
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import requests
|
||||
@@ -27,12 +28,14 @@ from transformers import (
|
||||
)
|
||||
from transformers.testing_utils import (
|
||||
is_flaky,
|
||||
require_cv2,
|
||||
require_flash_attn,
|
||||
require_torch,
|
||||
require_torch_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_cv2_available
|
||||
|
||||
from ...generation.test_utils import GenerationTesterMixin
|
||||
from ...test_configuration_common import ConfigTester
|
||||
@@ -44,6 +47,9 @@ from ...test_modeling_common import (
|
||||
)
|
||||
|
||||
|
||||
if is_cv2_available():
|
||||
import cv2
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
@@ -262,6 +268,59 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
|
||||
image_grid_thw=image_grid_thw,
|
||||
)
|
||||
|
||||
def test_video_forward(self):
|
||||
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
B = self.model_tester.batch_size
|
||||
C = config.vision_config.in_chans
|
||||
T = config.vision_config.temporal_patch_size
|
||||
P = config.vision_config.patch_size
|
||||
|
||||
input_ids = ids_tensor([B, self.model_tester.seq_length], self.model_tester.vocab_size)
|
||||
|
||||
F = 4
|
||||
patch_H = self.model_tester.image_size // P
|
||||
patch_W = self.model_tester.image_size // P
|
||||
patch_T = F // T
|
||||
patches_per_video = patch_T * patch_H * patch_W
|
||||
pixel_values_videos = floats_tensor(
|
||||
[
|
||||
# first dim: batch_size * num_patches
|
||||
B * patches_per_video,
|
||||
# second dim: in_channels * temporal_patch_size * patch_size^2
|
||||
C * T * (P**2),
|
||||
]
|
||||
)
|
||||
video_grid_thw = torch.tensor([[patch_T, patch_H, patch_W]] * B)
|
||||
|
||||
# sanity check
|
||||
assert pixel_values_videos.shape[0] == video_grid_thw.prod(dim=1).sum().item()
|
||||
|
||||
# Insert video token sequence
|
||||
input_ids[:, -1] = self.model_tester.pad_token_id
|
||||
input_ids[input_ids == self.model_tester.video_token_id] = self.model_tester.pad_token_id
|
||||
input_ids[input_ids == self.model_tester.image_token_id] = self.model_tester.pad_token_id
|
||||
input_ids[input_ids == self.model_tester.vision_start_token_id] = self.model_tester.pad_token_id
|
||||
input_ids[:, self.model_tester.num_image_tokens] = self.model_tester.video_token_id
|
||||
|
||||
insertion_point = self.model_tester.num_image_tokens
|
||||
|
||||
assert (B * patches_per_video) + insertion_point <= self.model_tester.seq_length
|
||||
for b in range(B):
|
||||
input_ids[b, insertion_point - 1] = self.model_tester.vision_start_token_id
|
||||
input_ids[b, insertion_point : insertion_point + patches_per_video] = self.model_tester.video_token_id
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
second_per_grid_ts = torch.tensor([1.0] * B, device=torch_device)
|
||||
model = model_class(config).to(torch_device)
|
||||
outputs = model(
|
||||
input_ids=input_ids,
|
||||
pixel_values_videos=pixel_values_videos,
|
||||
video_grid_thw=video_grid_thw,
|
||||
second_per_grid_ts=second_per_grid_ts,
|
||||
)
|
||||
self.assertIsNotNone(outputs)
|
||||
|
||||
@unittest.skip(reason="Feedforward chunking is not yet supported")
|
||||
def test_feed_forward_chunking(self):
|
||||
pass
|
||||
@@ -534,3 +593,52 @@ class Qwen2_5_VLIntegrationTest(unittest.TestCase):
|
||||
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@slow
|
||||
@require_cv2
|
||||
def test_small_model_integration_test_with_video(self):
|
||||
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
||||
"Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
|
||||
)
|
||||
|
||||
video_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
|
||||
messages2 = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "video",
|
||||
},
|
||||
{"type": "text", "text": "What is shown in this video?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
text = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
|
||||
|
||||
with tempfile.NamedTemporaryFile(suffix=".mp4") as f:
|
||||
f.write(requests.get(video_url).content)
|
||||
f.flush()
|
||||
cap = cv2.VideoCapture(f.name)
|
||||
|
||||
frames = []
|
||||
while True:
|
||||
ret, frame = cap.read()
|
||||
if not ret:
|
||||
break
|
||||
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
||||
frames.append(Image.fromarray(frame_rgb).resize((224, 224), Image.BICUBIC))
|
||||
|
||||
cap.release()
|
||||
|
||||
inputs = self.processor(text=[text], videos=[frames], return_tensors="pt").to(torch_device)
|
||||
|
||||
# it should not matter whether two images are the same size or not
|
||||
output = model.generate(**inputs, max_new_tokens=30)
|
||||
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
'system\nYou are a helpful assistant.\nuser\nWhat is shown in this video?\nassistant\nThe video shows an indoor tennis court with a person standing on one side, preparing to serve the ball. The individual is dressed in athletic attire, including',
|
||||
] # fmt: skip
|
||||
self.assertEqual(
|
||||
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user