Fix qwen2_5 get_rope_index tensor device locations (#37597)

* Fix qwen2_5 get_rope_index tensor device locations

* simpler fix

* edit right file for modular model

* add a test

* try normalizing type to fix non-video

* fix some imports

* add a video forward test with dummy input
This commit is contained in:
robert
2025-04-24 09:04:38 -05:00
committed by GitHub
parent dd2649fa98
commit 43bb4c0456
3 changed files with 118 additions and 0 deletions

View File

@@ -1663,6 +1663,11 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
range_tensor = torch.arange(llm_grid_t).view(-1, 1)
expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
## normalize type, send to device.
second_per_grid_t = torch.as_tensor(
second_per_grid_t, dtype=range_tensor.dtype, device=range_tensor.device
)
time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
time_tensor_long = time_tensor.long()

View File

@@ -559,6 +559,11 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
range_tensor = torch.arange(llm_grid_t).view(-1, 1)
expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
## normalize type, send to device.
second_per_grid_t = torch.as_tensor(
second_per_grid_t, dtype=range_tensor.dtype, device=range_tensor.device
)
time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
time_tensor_long = time_tensor.long()

View File

@@ -14,6 +14,7 @@
"""Testing suite for the PyTorch Qwen2.5-VL model."""
import gc
import tempfile
import unittest
import requests
@@ -27,12 +28,14 @@ from transformers import (
)
from transformers.testing_utils import (
is_flaky,
require_cv2,
require_flash_attn,
require_torch,
require_torch_gpu,
slow,
torch_device,
)
from transformers.utils import is_cv2_available
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
@@ -44,6 +47,9 @@ from ...test_modeling_common import (
)
if is_cv2_available():
import cv2
if is_torch_available():
import torch
@@ -262,6 +268,59 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
image_grid_thw=image_grid_thw,
)
def test_video_forward(self):
    """Smoke-test a forward pass that is driven by dummy video inputs.

    Random ``pixel_values_videos`` are built together with a matching
    ``video_grid_thw``; every row of ``input_ids`` gets a vision-start token
    followed by a run of video tokens; each model class must then return an
    output. ``second_per_grid_ts`` is deliberately created on
    ``torch_device`` so the forward pass is exercised with that tensor on
    the test device.
    """
    config, _ = self.model_tester.prepare_config_and_inputs_for_common()

    B = self.model_tester.batch_size
    C = config.vision_config.in_chans
    T = config.vision_config.temporal_patch_size
    P = config.vision_config.patch_size

    input_ids = ids_tensor([B, self.model_tester.seq_length], self.model_tester.vocab_size)

    num_frames = 4
    patch_H = self.model_tester.image_size // P
    patch_W = self.model_tester.image_size // P
    patch_T = num_frames // T

    patches_per_video = patch_T * patch_H * patch_W

    pixel_values_videos = floats_tensor(
        [
            # first dim: batch_size * num_patches
            B * patches_per_video,
            # second dim: in_channels * temporal_patch_size * patch_size^2
            C * T * (P**2),
        ]
    )
    video_grid_thw = torch.tensor([[patch_T, patch_H, patch_W]] * B)

    # Sanity check: the grid must account for every row of pixel values.
    self.assertEqual(pixel_values_videos.shape[0], video_grid_thw.prod(dim=1).sum().item())

    # Scrub every special token the random ids may contain, so the only
    # vision markers left are the ones written explicitly below.
    input_ids[:, -1] = self.model_tester.pad_token_id
    input_ids[input_ids == self.model_tester.video_token_id] = self.model_tester.pad_token_id
    input_ids[input_ids == self.model_tester.image_token_id] = self.model_tester.pad_token_id
    input_ids[input_ids == self.model_tester.vision_start_token_id] = self.model_tester.pad_token_id

    insertion_point = self.model_tester.num_image_tokens

    # Each row receives `patches_per_video` video tokens, so the per-row
    # span (not the batch total) is what has to fit in the sequence.
    self.assertLessEqual(insertion_point + patches_per_video, self.model_tester.seq_length)

    # NOTE(review): one video token is written per patch; presumably this
    # matches the model only while the tester's spatial merge size is 1 —
    # confirm against the model tester config.
    input_ids[:, insertion_point - 1] = self.model_tester.vision_start_token_id
    input_ids[:, insertion_point : insertion_point + patches_per_video] = self.model_tester.video_token_id

    # Identical for every model class, so build it once outside the loop.
    second_per_grid_ts = torch.tensor([1.0] * B, device=torch_device)

    for model_class in self.all_model_classes:
        model = model_class(config).to(torch_device)
        model.eval()
        # Forward-only check: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                pixel_values_videos=pixel_values_videos,
                video_grid_thw=video_grid_thw,
                second_per_grid_ts=second_per_grid_ts,
            )
        self.assertIsNotNone(outputs)
@unittest.skip(reason="Feedforward chunking is not yet supported")
def test_feed_forward_chunking(self):
pass
@@ -534,3 +593,52 @@ class Qwen2_5_VLIntegrationTest(unittest.TestCase):
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
@require_cv2
def test_small_model_integration_test_with_video(self):
    """Generate from a real video clip and compare against a pinned decoding.

    The clip is downloaded, decoded frame by frame with OpenCV, each frame is
    converted to RGB and resized to 224x224, and the frames are passed to the
    processor as a single video. The decoded generation (30 new tokens) must
    equal ``EXPECTED_DECODED_TEXT``.
    """
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
    )

    video_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                },
                {"type": "text", "text": "What is shown in this video?"},
            ],
        }
    ]
    text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Fail fast on a hung or unsuccessful download instead of handing an
    # error page to the video decoder.
    response = requests.get(video_url, timeout=60)
    response.raise_for_status()

    frames = []
    with tempfile.NamedTemporaryFile(suffix=".mp4") as f:
        f.write(response.content)
        # Flush so the decoder, which opens the file by name, sees all bytes.
        f.flush()
        cap = cv2.VideoCapture(f.name)
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                # OpenCV yields BGR frames; convert to RGB before building the PIL image.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame_rgb).resize((224, 224), Image.BICUBIC))
        finally:
            # Release the capture even if frame conversion raises.
            cap.release()

    self.assertGreater(len(frames), 0, "no frames could be decoded from the downloaded video")

    inputs = self.processor(text=[text], videos=[frames], return_tensors="pt").to(torch_device)

    output = model.generate(**inputs, max_new_tokens=30)

    EXPECTED_DECODED_TEXT = [
        'system\nYou are a helpful assistant.\nuser\nWhat is shown in this video?\nassistant\nThe video shows an indoor tennis court with a person standing on one side, preparing to serve the ball. The individual is dressed in athletic attire, including',
    ]  # fmt: skip

    self.assertEqual(
        self.processor.batch_decode(output, skip_special_tokens=True),
        EXPECTED_DECODED_TEXT,
    )