From a5c6172c81d69a6fa2c3b1340d72fc669b941dcd Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Mon, 12 May 2025 12:14:04 +0200 Subject: [PATCH] [VLM] fix loading issues (#38051) * fix qwen2-vl loading * fix a few more models * delete print * fix copies --- src/transformers/modeling_utils.py | 15 ++++++--------- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 1 + .../models/qwen2_5_vl/modular_qwen2_5_vl.py | 1 + .../models/qwen2_vl/modeling_qwen2_vl.py | 1 + .../models/aya_vision/test_modeling_aya_vision.py | 8 -------- 5 files changed, 9 insertions(+), 17 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index cdaf68d761..f994d9b087 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -219,22 +219,19 @@ TORCH_INIT_FUNCTIONS = { # DO NOT MODIFY, KEPT FOR BC ONLY VLMS = [ "aria", - "aya_vision", + "ayavision", "emu3", "fuyu", - "got_ocr2", + "gotocr2", "gemma3", "internvl", - "llava", - "llava_next", - "llava_next_video", - "llava_onevision", + "llava", # all llava prefixed models fall under this check "mistral3", "mllama", "paligemma", - "qwen2_vl", - "qwem2_5_vl", - "video_llava", + "qwen2vl", + "qwen2_5_vl", + "videollava", "vipllava", ] diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index fa2aa76159..82ac330dd9 100644 --- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -1381,6 +1381,7 @@ class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel): @auto_docstring class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel): + base_model_prefix = "" _checkpoint_conversion_mapping = {"^model": "language_model"} config_class = Qwen2_5_VLConfig _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"] diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py 
b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index 0adb966f4f..4f9882d223 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -414,6 +414,7 @@ class Qwen2_5_VLModelOutputWithPast(Qwen2VLModelOutputWithPast): class Qwen2_5_VLModel(Qwen2VLModel): config_class = Qwen2_5_VLConfig + base_model_prefix = "" _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"] def __init__(self, config): diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index ca9783cc4c..4e78a25915 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1341,6 +1341,7 @@ class Qwen2VLTextModel(Qwen2VLPreTrainedModel): @auto_docstring class Qwen2VLModel(Qwen2VLPreTrainedModel): + base_model_prefix = "" _checkpoint_conversion_mapping = {"^model": "language_model"} def __init__(self, config: Qwen2VLConfig): diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py index f4a464ed62..785ad723f4 100644 --- a/tests/models/aya_vision/test_modeling_aya_vision.py +++ b/tests/models/aya_vision/test_modeling_aya_vision.py @@ -144,7 +144,6 @@ class AyaVisionVisionText2TextModelTester: config, pixel_values = config_and_inputs input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) - print("attention_mask", attention_mask.shape) # input_ids[:, -1] = self.pad_token_id input_ids[input_ids == self.image_token_index] = self.pad_token_id input_ids[:, : self.image_seq_length] = self.image_token_index @@ -366,7 +365,6 @@ class AyaVisionIntegrationTest(unittest.TestCase): output = model(**inputs) actual_logits = output.logits[0, -1, :5].cpu() - print("actual_logits", actual_logits) expected_logits = 
torch.tensor([0.4109, 0.1532, 0.8018, 2.1328, 0.5483], dtype=torch.float16) self.assertTrue( torch.allclose(actual_logits, expected_logits, atol=0.1), @@ -400,7 +398,6 @@ class AyaVisionIntegrationTest(unittest.TestCase): decoded_output = processor.decode( generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True ) - print("decoded_output", decoded_output) expected_outputs = Expectations( { @@ -437,7 +434,6 @@ class AyaVisionIntegrationTest(unittest.TestCase): decoded_output = processor.decode( generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True ) - print("decoded_output", decoded_output) expected_output = "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats," # fmt: skip self.assertEqual(decoded_output, expected_output) @@ -477,7 +473,6 @@ class AyaVisionIntegrationTest(unittest.TestCase): # Check first output decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True) - print("decoded_output", decoded_output) expected_outputs = Expectations( { ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.", @@ -494,7 +489,6 @@ class AyaVisionIntegrationTest(unittest.TestCase): # Check second output decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True) - print("decoded_output", decoded_output) expected_output = 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. 
The focal point is a' # fmt: skip self.assertEqual( @@ -558,7 +552,6 @@ class AyaVisionIntegrationTest(unittest.TestCase): ) # fmt: skip expected_output = expected_outputs.get_expectation() - print("decoded_output", decoded_output) self.assertEqual( decoded_output, expected_output, @@ -567,7 +560,6 @@ class AyaVisionIntegrationTest(unittest.TestCase): # Check second output decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True) - print("decoded_output", decoded_output) expected_outputs = Expectations( { ("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ",