From 43bb4c0456ebab67ca6b11fa5fa4c099fb2e6a2c Mon Sep 17 00:00:00 2001
From: robert <rphmeier@gmail.com>
Date: Thu, 24 Apr 2025 09:04:38 -0500
Subject: [PATCH] Fix qwen2_5 get_rope_index tensor device locations (#37597)

* Fix qwen2_5 get_rope_index tensor device locations

* simpler fix

* edit right file for modular model

* add a test

* try normalizing type to fix non-video

* fix some imports

* add a video forward test with dummy input
---
 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py  |   5 +
 .../models/qwen2_5_vl/modular_qwen2_5_vl.py   |   5 +
 .../qwen2_5_vl/test_modeling_qwen2_5_vl.py    | 108 ++++++++++++++++++
 3 files changed, 118 insertions(+)

diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 8155a0d280..4da0f59bf4 100644
--- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -1663,6 +1663,11 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
                     range_tensor = torch.arange(llm_grid_t).view(-1, 1)
                     expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
 
+                    ## normalize type, send to device.
+                    second_per_grid_t = torch.as_tensor(
+                        second_per_grid_t, dtype=range_tensor.dtype, device=range_tensor.device
+                    )
+
                     time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
 
                     time_tensor_long = time_tensor.long()
diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
index f34c48bb54..e34724c790 100644
--- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -559,6 +559,11 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
                     range_tensor = torch.arange(llm_grid_t).view(-1, 1)
                     expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
 
+                    ## normalize type, send to device.
+                    second_per_grid_t = torch.as_tensor(
+                        second_per_grid_t, dtype=range_tensor.dtype, device=range_tensor.device
+                    )
+
                     time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
 
                     time_tensor_long = time_tensor.long()
diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
index a0579ce202..21947dca35 100644
--- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
@@ -14,6 +14,7 @@
 """Testing suite for the PyTorch Qwen2.5-VL model."""
 
 import gc
+import tempfile
 import unittest
 
 import requests
@@ -27,12 +28,14 @@ from transformers import (
 )
 from transformers.testing_utils import (
     is_flaky,
+    require_cv2,
     require_flash_attn,
     require_torch,
     require_torch_gpu,
     slow,
     torch_device,
 )
+from transformers.utils import is_cv2_available
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -44,6 +47,9 @@ from ...test_modeling_common import (
 )
 
 
+if is_cv2_available():
+    import cv2
+
 if is_torch_available():
     import torch
 
@@ -262,6 +268,59 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
                 image_grid_thw=image_grid_thw,
             )
 
+    def test_video_forward(self):
+        """Smoke-test a forward pass on dummy video input with `second_per_grid_ts` on `torch_device`."""
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+        tester = self.model_tester
+
+        batch_size = tester.batch_size
+        in_channels = config.vision_config.in_chans
+        temporal_patch_size = config.vision_config.temporal_patch_size
+        patch_size = config.vision_config.patch_size
+
+        input_ids = ids_tensor([batch_size, tester.seq_length], tester.vocab_size)
+
+        # Dummy clip of `num_frames` frames, split into temporal/spatial patches.
+        num_frames = 4
+        patch_h = patch_w = tester.image_size // patch_size
+        patch_t = num_frames // temporal_patch_size
+        patches_per_video = patch_t * patch_h * patch_w
+
+        # Shape: (batch_size * patches_per_video, in_channels * temporal_patch_size * patch_size^2)
+        pixel_values_videos = floats_tensor(
+            [batch_size * patches_per_video, in_channels * temporal_patch_size * patch_size**2]
+        )
+        video_grid_thw = torch.tensor([[patch_t, patch_h, patch_w]] * batch_size)
+        # Sanity check: the patch count implied by the grid matches the pixel tensor.
+        self.assertEqual(pixel_values_videos.shape[0], video_grid_thw.prod(dim=1).sum().item())
+
+        # Blank out special tokens produced by random sampling so only the span below is "video".
+        input_ids[:, -1] = tester.pad_token_id
+        for special_id in (tester.video_token_id, tester.image_token_id, tester.vision_start_token_id):
+            input_ids[input_ids == special_id] = tester.pad_token_id
+
+        # Each row holds exactly one video: a vision-start token followed by its patch tokens,
+        # so the span must fit within a single row (not batch_size rows' worth of tokens).
+        insertion_point = tester.num_image_tokens
+        self.assertGreaterEqual(insertion_point, 1)
+        self.assertLessEqual(insertion_point + patches_per_video, tester.seq_length)
+        input_ids[:, insertion_point - 1] = tester.vision_start_token_id
+        input_ids[:, insertion_point : insertion_point + patches_per_video] = tester.video_token_id
+
+        # Loop-invariant input; kept on `torch_device` to exercise the device handling in rope indexing.
+        second_per_grid_ts = torch.tensor([1.0] * batch_size, device=torch_device)
+
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device).eval()
+            with torch.no_grad():
+                outputs = model(
+                    input_ids=input_ids,
+                    pixel_values_videos=pixel_values_videos,
+                    video_grid_thw=video_grid_thw,
+                    second_per_grid_ts=second_per_grid_ts,
+                )
+            self.assertIsNotNone(outputs)
+
     @unittest.skip(reason="Feedforward chunking is not yet supported")
     def test_feed_forward_chunking(self):
         pass
@@ -534,3 +593,52 @@ class Qwen2_5_VLIntegrationTest(unittest.TestCase):
             self.processor.batch_decode(output, skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,
         )
+
+    @slow
+    @require_cv2
+    def test_small_model_integration_test_with_video(self):
+        """Generate from the tennis fixture video and compare against the reference completion."""
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+        )
+
+        video_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
+        messages = [
+            {
+                "role": "user",
+                "content": [{"type": "video"}, {"type": "text", "text": "What is shown in this video?"}],
+            }
+        ]
+        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+        # Fail loudly on a bad download instead of decoding an error page into zero frames.
+        response = requests.get(video_url, timeout=60)
+        response.raise_for_status()
+
+        frames = []
+        with tempfile.NamedTemporaryFile(suffix=".mp4") as f:
+            f.write(response.content)
+            f.flush()
+            cap = cv2.VideoCapture(f.name)
+            try:
+                while True:
+                    ret, frame = cap.read()
+                    if not ret:
+                        break
+                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes as BGR
+                    frames.append(Image.fromarray(frame_rgb).resize((224, 224), Image.BICUBIC))
+            finally:
+                # Release the decoder handle even if frame conversion raises.
+                cap.release()
+        self.assertTrue(frames, "no frames could be decoded from the downloaded video")
+
+        inputs = self.processor(text=[text], videos=[frames], return_tensors="pt").to(torch_device)
+        output = model.generate(**inputs, max_new_tokens=30)
+
+        EXPECTED_DECODED_TEXT = [
+            'system\nYou are a helpful assistant.\nuser\nWhat is shown in this video?\nassistant\nThe video shows an indoor tennis court with a person standing on one side, preparing to serve the ball. The individual is dressed in athletic attire, including',
+        ]  # fmt: skip
+        self.assertEqual(
+            self.processor.batch_decode(output, skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
+        )