Fix qwen2_5 get_rope_index tensor device locations (#37597)
* Fix qwen2_5 get_rope_index tensor device locations
* simpler fix
* edit right file for modular model
* add a test
* try normalizing type to fix non-video
* fix some imports
* add a video forward test with dummy input
This commit is contained in:
@@ -1663,6 +1663,11 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
|
|||||||
range_tensor = torch.arange(llm_grid_t).view(-1, 1)
|
range_tensor = torch.arange(llm_grid_t).view(-1, 1)
|
||||||
expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
|
expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
|
||||||
|
|
||||||
|
## normalize type, send to device.
|
||||||
|
second_per_grid_t = torch.as_tensor(
|
||||||
|
second_per_grid_t, dtype=range_tensor.dtype, device=range_tensor.device
|
||||||
|
)
|
||||||
|
|
||||||
time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
|
time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
|
||||||
|
|
||||||
time_tensor_long = time_tensor.long()
|
time_tensor_long = time_tensor.long()
|
||||||
|
|||||||
@@ -559,6 +559,11 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
|
|||||||
range_tensor = torch.arange(llm_grid_t).view(-1, 1)
|
range_tensor = torch.arange(llm_grid_t).view(-1, 1)
|
||||||
expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
|
expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
|
||||||
|
|
||||||
|
## normalize type, send to device.
|
||||||
|
second_per_grid_t = torch.as_tensor(
|
||||||
|
second_per_grid_t, dtype=range_tensor.dtype, device=range_tensor.device
|
||||||
|
)
|
||||||
|
|
||||||
time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
|
time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
|
||||||
|
|
||||||
time_tensor_long = time_tensor.long()
|
time_tensor_long = time_tensor.long()
|
||||||
|
|||||||
@@ -14,6 +14,7 @@
|
|||||||
"""Testing suite for the PyTorch Qwen2.5-VL model."""
|
"""Testing suite for the PyTorch Qwen2.5-VL model."""
|
||||||
|
|
||||||
import gc
|
import gc
|
||||||
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@@ -27,12 +28,14 @@ from transformers import (
|
|||||||
)
|
)
|
||||||
from transformers.testing_utils import (
|
from transformers.testing_utils import (
|
||||||
is_flaky,
|
is_flaky,
|
||||||
|
require_cv2,
|
||||||
require_flash_attn,
|
require_flash_attn,
|
||||||
require_torch,
|
require_torch,
|
||||||
require_torch_gpu,
|
require_torch_gpu,
|
||||||
slow,
|
slow,
|
||||||
torch_device,
|
torch_device,
|
||||||
)
|
)
|
||||||
|
from transformers.utils import is_cv2_available
|
||||||
|
|
||||||
from ...generation.test_utils import GenerationTesterMixin
|
from ...generation.test_utils import GenerationTesterMixin
|
||||||
from ...test_configuration_common import ConfigTester
|
from ...test_configuration_common import ConfigTester
|
||||||
@@ -44,6 +47,9 @@ from ...test_modeling_common import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if is_cv2_available():
|
||||||
|
import cv2
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@@ -262,6 +268,59 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
|
|||||||
image_grid_thw=image_grid_thw,
|
image_grid_thw=image_grid_thw,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_video_forward(self):
|
||||||
|
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
|
B = self.model_tester.batch_size
|
||||||
|
C = config.vision_config.in_chans
|
||||||
|
T = config.vision_config.temporal_patch_size
|
||||||
|
P = config.vision_config.patch_size
|
||||||
|
|
||||||
|
input_ids = ids_tensor([B, self.model_tester.seq_length], self.model_tester.vocab_size)
|
||||||
|
|
||||||
|
F = 4
|
||||||
|
patch_H = self.model_tester.image_size // P
|
||||||
|
patch_W = self.model_tester.image_size // P
|
||||||
|
patch_T = F // T
|
||||||
|
patches_per_video = patch_T * patch_H * patch_W
|
||||||
|
pixel_values_videos = floats_tensor(
|
||||||
|
[
|
||||||
|
# first dim: batch_size * num_patches
|
||||||
|
B * patches_per_video,
|
||||||
|
# second dim: in_channels * temporal_patch_size * patch_size^2
|
||||||
|
C * T * (P**2),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
video_grid_thw = torch.tensor([[patch_T, patch_H, patch_W]] * B)
|
||||||
|
|
||||||
|
# sanity check
|
||||||
|
assert pixel_values_videos.shape[0] == video_grid_thw.prod(dim=1).sum().item()
|
||||||
|
|
||||||
|
# Insert video token sequence
|
||||||
|
input_ids[:, -1] = self.model_tester.pad_token_id
|
||||||
|
input_ids[input_ids == self.model_tester.video_token_id] = self.model_tester.pad_token_id
|
||||||
|
input_ids[input_ids == self.model_tester.image_token_id] = self.model_tester.pad_token_id
|
||||||
|
input_ids[input_ids == self.model_tester.vision_start_token_id] = self.model_tester.pad_token_id
|
||||||
|
input_ids[:, self.model_tester.num_image_tokens] = self.model_tester.video_token_id
|
||||||
|
|
||||||
|
insertion_point = self.model_tester.num_image_tokens
|
||||||
|
|
||||||
|
assert (B * patches_per_video) + insertion_point <= self.model_tester.seq_length
|
||||||
|
for b in range(B):
|
||||||
|
input_ids[b, insertion_point - 1] = self.model_tester.vision_start_token_id
|
||||||
|
input_ids[b, insertion_point : insertion_point + patches_per_video] = self.model_tester.video_token_id
|
||||||
|
|
||||||
|
for model_class in self.all_model_classes:
|
||||||
|
second_per_grid_ts = torch.tensor([1.0] * B, device=torch_device)
|
||||||
|
model = model_class(config).to(torch_device)
|
||||||
|
outputs = model(
|
||||||
|
input_ids=input_ids,
|
||||||
|
pixel_values_videos=pixel_values_videos,
|
||||||
|
video_grid_thw=video_grid_thw,
|
||||||
|
second_per_grid_ts=second_per_grid_ts,
|
||||||
|
)
|
||||||
|
self.assertIsNotNone(outputs)
|
||||||
|
|
||||||
@unittest.skip(reason="Feedforward chunking is not yet supported")
|
@unittest.skip(reason="Feedforward chunking is not yet supported")
|
||||||
def test_feed_forward_chunking(self):
|
def test_feed_forward_chunking(self):
|
||||||
pass
|
pass
|
||||||
@@ -534,3 +593,52 @@ class Qwen2_5_VLIntegrationTest(unittest.TestCase):
|
|||||||
self.processor.batch_decode(output, skip_special_tokens=True),
|
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||||
EXPECTED_DECODED_TEXT,
|
EXPECTED_DECODED_TEXT,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
@require_cv2
|
||||||
|
def test_small_model_integration_test_with_video(self):
|
||||||
|
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
||||||
|
"Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
|
||||||
|
)
|
||||||
|
|
||||||
|
video_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
|
||||||
|
messages2 = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "video",
|
||||||
|
},
|
||||||
|
{"type": "text", "text": "What is shown in this video?"},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
text = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(suffix=".mp4") as f:
|
||||||
|
f.write(requests.get(video_url).content)
|
||||||
|
f.flush()
|
||||||
|
cap = cv2.VideoCapture(f.name)
|
||||||
|
|
||||||
|
frames = []
|
||||||
|
while True:
|
||||||
|
ret, frame = cap.read()
|
||||||
|
if not ret:
|
||||||
|
break
|
||||||
|
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
||||||
|
frames.append(Image.fromarray(frame_rgb).resize((224, 224), Image.BICUBIC))
|
||||||
|
|
||||||
|
cap.release()
|
||||||
|
|
||||||
|
inputs = self.processor(text=[text], videos=[frames], return_tensors="pt").to(torch_device)
|
||||||
|
|
||||||
|
# it should not matter whether two images are the same size or not
|
||||||
|
output = model.generate(**inputs, max_new_tokens=30)
|
||||||
|
|
||||||
|
EXPECTED_DECODED_TEXT = [
|
||||||
|
'system\nYou are a helpful assistant.\nuser\nWhat is shown in this video?\nassistant\nThe video shows an indoor tennis court with a person standing on one side, preparing to serve the ball. The individual is dressed in athletic attire, including',
|
||||||
|
] # fmt: skip
|
||||||
|
self.assertEqual(
|
||||||
|
self.processor.batch_decode(output, skip_special_tokens=True),
|
||||||
|
EXPECTED_DECODED_TEXT,
|
||||||
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user