From 43bb4c0456ebab67ca6b11fa5fa4c099fb2e6a2c Mon Sep 17 00:00:00 2001
From: robert
Date: Thu, 24 Apr 2025 09:04:38 -0500
Subject: [PATCH] Fix qwen2_5 get_rope_index tensor device locations (#37597)

* Fix qwen2_5 get_rope_index tensor device locations

* simpler fix

* edit right file for modular model

* add a test

* try normalizing type to fix non-video

* fix some imports

* add a video forward test with dummy input
---
Review amendments (this section is ignored by `git am`):
- second_per_grid_t is moved to range_tensor's device but keeps a float
  dtype; casting it to range_tensor's int64 dtype truncated fractional
  seconds-per-grid values to 0.
- the integration test downloads the video with a timeout and decodes the
  frames while the temporary file still exists.
- line structure restored; context-line whitespace is reconstructed, and the
  `index` blob hashes below still refer to the unamended change.

 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py  |   5 +
 .../models/qwen2_5_vl/modular_qwen2_5_vl.py   |   5 +
 .../qwen2_5_vl/test_modeling_qwen2_5_vl.py    | 108 ++++++++++++++++++
 3 files changed, 118 insertions(+)

diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 8155a0d280..4da0f59bf4 100644
--- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -1663,6 +1663,11 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
                     range_tensor = torch.arange(llm_grid_t).view(-1, 1)
                     expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
 
+                    # Align the device only; stay float so fractional seconds per grid are not truncated to 0.
+                    second_per_grid_t = torch.as_tensor(
+                        second_per_grid_t, dtype=torch.float32, device=range_tensor.device
+                    )
+
                     time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
 
                     time_tensor_long = time_tensor.long()
diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
index f34c48bb54..e34724c790 100644
--- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -559,6 +559,11 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
                     range_tensor = torch.arange(llm_grid_t).view(-1, 1)
                     expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
 
+                    # Align the device only; stay float so fractional seconds per grid are not truncated to 0.
+                    second_per_grid_t = torch.as_tensor(
+                        second_per_grid_t, dtype=torch.float32, device=range_tensor.device
+                    )
+
                     time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
 
                     time_tensor_long = time_tensor.long()
diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
index a0579ce202..21947dca35 100644
--- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
@@ -14,6 +14,7 @@
 """Testing suite for the PyTorch Qwen2.5-VL model."""
 
 import gc
+import tempfile
 import unittest
 
 import requests
@@ -27,12 +28,14 @@ from transformers import (
 )
 from transformers.testing_utils import (
     is_flaky,
+    require_cv2,
     require_flash_attn,
     require_torch,
     require_torch_gpu,
     slow,
     torch_device,
 )
+from transformers.utils import is_cv2_available
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -44,6 +47,9 @@ from ...test_modeling_common import (
 )
 
 
+if is_cv2_available():
+    import cv2
+
 if is_torch_available():
     import torch
 
@@ -262,6 +268,59 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
                 image_grid_thw=image_grid_thw,
             )
 
+    def test_video_forward(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        B = self.model_tester.batch_size
+        C = config.vision_config.in_chans
+        T = config.vision_config.temporal_patch_size
+        P = config.vision_config.patch_size
+
+        input_ids = ids_tensor([B, self.model_tester.seq_length], self.model_tester.vocab_size)
+
+        F = 4
+        patch_H = self.model_tester.image_size // P
+        patch_W = self.model_tester.image_size // P
+        patch_T = F // T
+        patches_per_video = patch_T * patch_H * patch_W
+        pixel_values_videos = floats_tensor(
+            [
+                # first dim: batch_size * num_patches
+                B * patches_per_video,
+                # second dim: in_channels * temporal_patch_size * patch_size^2
+                C * T * (P**2),
+            ]
+        )
+        video_grid_thw = torch.tensor([[patch_T, patch_H, patch_W]] * B)
+
+        # sanity check
+        self.assertEqual(pixel_values_videos.shape[0], video_grid_thw.prod(dim=1).sum().item())
+
+        # Insert video token sequence
+        input_ids[:, -1] = self.model_tester.pad_token_id
+        input_ids[input_ids == self.model_tester.video_token_id] = self.model_tester.pad_token_id
+        input_ids[input_ids == self.model_tester.image_token_id] = self.model_tester.pad_token_id
+        input_ids[input_ids == self.model_tester.vision_start_token_id] = self.model_tester.pad_token_id
+        input_ids[:, self.model_tester.num_image_tokens] = self.model_tester.video_token_id
+
+        insertion_point = self.model_tester.num_image_tokens
+
+        self.assertLessEqual((B * patches_per_video) + insertion_point, self.model_tester.seq_length)
+        for b in range(B):
+            input_ids[b, insertion_point - 1] = self.model_tester.vision_start_token_id
+            input_ids[b, insertion_point : insertion_point + patches_per_video] = self.model_tester.video_token_id
+
+        for model_class in self.all_model_classes:
+            second_per_grid_ts = torch.tensor([1.0] * B, device=torch_device)
+            model = model_class(config).to(torch_device)
+            outputs = model(
+                input_ids=input_ids,
+                pixel_values_videos=pixel_values_videos,
+                video_grid_thw=video_grid_thw,
+                second_per_grid_ts=second_per_grid_ts,
+            )
+            self.assertIsNotNone(outputs)
+
     @unittest.skip(reason="Feedforward chunking is not yet supported")
     def test_feed_forward_chunking(self):
         pass
@@ -534,3 +593,52 @@ class Qwen2_5_VLIntegrationTest(unittest.TestCase):
             self.processor.batch_decode(output, skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,
         )
+
+    @slow
+    @require_cv2
+    def test_small_model_integration_test_with_video(self):
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+        )
+
+        video_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
+        messages2 = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                    },
+                    {"type": "text", "text": "What is shown in this video?"},
+                ],
+            }
+        ]
+        text = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
+
+        with tempfile.NamedTemporaryFile(suffix=".mp4") as f:
+            f.write(requests.get(video_url, timeout=60).content)
+            f.flush()
+            cap = cv2.VideoCapture(f.name)
+
+            frames = []
+            while True:
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                frames.append(Image.fromarray(frame_rgb).resize((224, 224), Image.BICUBIC))
+
+            cap.release()
+
+        inputs = self.processor(text=[text], videos=[frames], return_tensors="pt").to(torch_device)
+
+        # generate a short description for the single video passed above
+        output = model.generate(**inputs, max_new_tokens=30)
+
+        EXPECTED_DECODED_TEXT = [
+            'system\nYou are a helpful assistant.\nuser\nWhat is shown in this video?\nassistant\nThe video shows an indoor tennis court with a person standing on one side, preparing to serve the ball. The individual is dressed in athletic attire, including',
+        ]  # fmt: skip
+        self.assertEqual(
+            self.processor.batch_decode(output, skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
+        )