From a5c6172c81d69a6fa2c3b1340d72fc669b941dcd Mon Sep 17 00:00:00 2001
From: Raushan Turganbay <raushan@huggingface.co>
Date: Mon, 12 May 2025 12:14:04 +0200
Subject: [PATCH] [VLM] fix loading issues (#38051)

* fix qwen2-vl loading

* fix a few more models

* delete print

* fix copies
---
 src/transformers/modeling_utils.py                | 15 ++++++---------
 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py      |  1 +
 .../models/qwen2_5_vl/modular_qwen2_5_vl.py       |  1 +
 .../models/qwen2_vl/modeling_qwen2_vl.py          |  1 +
 .../models/aya_vision/test_modeling_aya_vision.py |  8 --------
 5 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index cdaf68d761..f994d9b087 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -219,22 +219,19 @@ TORCH_INIT_FUNCTIONS = {
 # DO NOT MODIFY, KEPT FOR BC ONLY
 VLMS = [
     "aria",
-    "aya_vision",
+    "ayavision",
     "emu3",
     "fuyu",
-    "got_ocr2",
+    "gotocr2",
     "gemma3",
     "internvl",
-    "llava",
-    "llava_next",
-    "llava_next_video",
-    "llava_onevision",
+    "llava",  # all llava prefixed models fall under this check
     "mistral3",
     "mllama",
     "paligemma",
-    "qwen2_vl",
-    "qwem2_5_vl",
-    "video_llava",
+    "qwen2vl",
+    "qwen2_5_vl",
+    "videollava",
     "vipllava",
 ]
 
diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index fa2aa76159..82ac330dd9 100644
--- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -1381,6 +1381,7 @@ class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel):
 
 @auto_docstring
 class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
+    base_model_prefix = ""
     _checkpoint_conversion_mapping = {"^model": "language_model"}
     config_class = Qwen2_5_VLConfig
     _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
index 0adb966f4f..4f9882d223 100644
--- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -414,6 +414,7 @@ class Qwen2_5_VLModelOutputWithPast(Qwen2VLModelOutputWithPast):
 
 class Qwen2_5_VLModel(Qwen2VLModel):
     config_class = Qwen2_5_VLConfig
+    base_model_prefix = ""
     _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
 
     def __init__(self, config):
diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
index ca9783cc4c..4e78a25915 100644
--- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -1341,6 +1341,7 @@ class Qwen2VLTextModel(Qwen2VLPreTrainedModel):
 
 @auto_docstring
 class Qwen2VLModel(Qwen2VLPreTrainedModel):
+    base_model_prefix = ""
     _checkpoint_conversion_mapping = {"^model": "language_model"}
 
     def __init__(self, config: Qwen2VLConfig):
diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py
index f4a464ed62..785ad723f4 100644
--- a/tests/models/aya_vision/test_modeling_aya_vision.py
+++ b/tests/models/aya_vision/test_modeling_aya_vision.py
@@ -144,7 +144,6 @@ class AyaVisionVisionText2TextModelTester:
         config, pixel_values = config_and_inputs
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
         attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
-        print("attention_mask", attention_mask.shape)
         # input_ids[:, -1] = self.pad_token_id
         input_ids[input_ids == self.image_token_index] = self.pad_token_id
         input_ids[:, : self.image_seq_length] = self.image_token_index
@@ -366,7 +365,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
             output = model(**inputs)
 
         actual_logits = output.logits[0, -1, :5].cpu()
-        print("actual_logits", actual_logits)
         expected_logits = torch.tensor([0.4109, 0.1532, 0.8018, 2.1328, 0.5483], dtype=torch.float16)
         self.assertTrue(
             torch.allclose(actual_logits, expected_logits, atol=0.1),
@@ -400,7 +398,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
             decoded_output = processor.decode(
                 generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
             )
-        print("decoded_output", decoded_output)
 
         expected_outputs = Expectations(
             {
@@ -437,7 +434,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
             decoded_output = processor.decode(
                 generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
             )
-        print("decoded_output", decoded_output)
         expected_output = "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,"  # fmt: skip
         self.assertEqual(decoded_output, expected_output)
 
@@ -477,7 +473,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
 
         # Check first output
         decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
-        print("decoded_output", decoded_output)
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
@@ -494,7 +489,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
 
         # Check second output
         decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
-        print("decoded_output", decoded_output)
         expected_output = 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a'  # fmt: skip
 
         self.assertEqual(
@@ -558,7 +552,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
 
-        print("decoded_output", decoded_output)
         self.assertEqual(
             decoded_output,
             expected_output,
@@ -567,7 +560,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
 
         # Check second output
         decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
-        print("decoded_output", decoded_output)
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ",