Fix qwen2_5 get_rope_index tensor device locations (#37597)

* Fix qwen2_5 get_rope_index tensor device locations * simpler fix * edit right file for modular model * add a test * try normalizing type to fix non-video * fix some imports * add a video forward test with dummy input
2025-04-24 09:04:38 -05:00
parent dd2649fa98
commit 43bb4c0456
3 changed files with 118 additions and 0 deletions
--- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -1663,6 +1663,11 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
                    range_tensor = torch.arange(llm_grid_t).view(-1, 1)
                    expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
                    ## normalize type, send to device.
                    second_per_grid_t = torch.as_tensor(
                        second_per_grid_t, dtype=range_tensor.dtype, device=range_tensor.device
                    )
                    time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
                    time_tensor_long = time_tensor.long()
--- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -559,6 +559,11 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
                    range_tensor = torch.arange(llm_grid_t).view(-1, 1)
                    expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
                    ## normalize type, send to device.
                    second_per_grid_t = torch.as_tensor(
                        second_per_grid_t, dtype=range_tensor.dtype, device=range_tensor.device
                    )
                    time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
                    time_tensor_long = time_tensor.long()
--- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
@@ -14,6 +14,7 @@
 """Testing suite for the PyTorch Qwen2.5-VL model."""
 import gc
 import tempfile
 import unittest
 import requests
@@ -27,12 +28,14 @@ from transformers import (
 )
 from transformers.testing_utils import (
    is_flaky,
    require_cv2,
    require_flash_attn,
    require_torch,
    require_torch_gpu,
    slow,
    torch_device,
 )
 from transformers.utils import is_cv2_available
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -44,6 +47,9 @@ from ...test_modeling_common import (
 )
 if is_cv2_available():
    import cv2
 if is_torch_available():
    import torch
@@ -262,6 +268,59 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
                image_grid_thw=image_grid_thw,
            )
    def test_video_forward(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
        B = self.model_tester.batch_size
        C = config.vision_config.in_chans
        T = config.vision_config.temporal_patch_size
        P = config.vision_config.patch_size
        input_ids = ids_tensor([B, self.model_tester.seq_length], self.model_tester.vocab_size)
        F = 4
        patch_H = self.model_tester.image_size // P
        patch_W = self.model_tester.image_size // P
        patch_T = F // T
        patches_per_video = patch_T * patch_H * patch_W
        pixel_values_videos = floats_tensor(
            [
                # first dim: batch_size * num_patches
                B * patches_per_video,
                # second dim: in_channels * temporal_patch_size * patch_size^2
                C * T * (P**2),
            ]
        )
        video_grid_thw = torch.tensor([[patch_T, patch_H, patch_W]] * B)
        # sanity check
        assert pixel_values_videos.shape[0] == video_grid_thw.prod(dim=1).sum().item()
        # Insert video token sequence
        input_ids[:, -1] = self.model_tester.pad_token_id
        input_ids[input_ids == self.model_tester.video_token_id] = self.model_tester.pad_token_id
        input_ids[input_ids == self.model_tester.image_token_id] = self.model_tester.pad_token_id
        input_ids[input_ids == self.model_tester.vision_start_token_id] = self.model_tester.pad_token_id
        input_ids[:, self.model_tester.num_image_tokens] = self.model_tester.video_token_id
        insertion_point = self.model_tester.num_image_tokens
        assert (B * patches_per_video) + insertion_point <= self.model_tester.seq_length
        for b in range(B):
            input_ids[b, insertion_point - 1] = self.model_tester.vision_start_token_id
            input_ids[b, insertion_point : insertion_point + patches_per_video] = self.model_tester.video_token_id
        for model_class in self.all_model_classes:
            second_per_grid_ts = torch.tensor([1.0] * B, device=torch_device)
            model = model_class(config).to(torch_device)
            outputs = model(
                input_ids=input_ids,
                pixel_values_videos=pixel_values_videos,
                video_grid_thw=video_grid_thw,
                second_per_grid_ts=second_per_grid_ts,
            )
            self.assertIsNotNone(outputs)
    @unittest.skip(reason="Feedforward chunking is not yet supported")
    def test_feed_forward_chunking(self):
        pass
@@ -534,3 +593,52 @@ class Qwen2_5_VLIntegrationTest(unittest.TestCase):
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )
    @slow
    @require_cv2
    def test_small_model_integration_test_with_video(self):
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
        )
        video_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
        messages2 = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                    },
                    {"type": "text", "text": "What is shown in this video?"},
                ],
            }
        ]
        text = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
        with tempfile.NamedTemporaryFile(suffix=".mp4") as f:
            f.write(requests.get(video_url).content)
            f.flush()
            cap = cv2.VideoCapture(f.name)
            frames = []
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame_rgb).resize((224, 224), Image.BICUBIC))
            cap.release()
        inputs = self.processor(text=[text], videos=[frames], return_tensors="pt").to(torch_device)
        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)
        EXPECTED_DECODED_TEXT = [
            'system\nYou are a helpful assistant.\nuser\nWhat is shown in this video?\nassistant\nThe video shows an indoor tennis court with a person standing on one side, preparing to serve the ball. The individual is dressed in athletic attire, including',
        ]  # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )