[VLM] fix loading issues (#38051)

* fix qwen2-vl loading

* fix a few more models

* delete print

* fix copies
This commit is contained in:
Raushan Turganbay
2025-05-12 12:14:04 +02:00
committed by GitHub
parent a31fa218ad
commit a5c6172c81
5 changed files with 9 additions and 17 deletions

View File

@@ -219,22 +219,19 @@ TORCH_INIT_FUNCTIONS = {
# DO NOT MODIFY, KEPT FOR BC ONLY # DO NOT MODIFY, KEPT FOR BC ONLY
VLMS = [ VLMS = [
"aria", "aria",
"aya_vision", "ayavision",
"emu3", "emu3",
"fuyu", "fuyu",
"got_ocr2", "gotocr2",
"gemma3", "gemma3",
"internvl", "internvl",
"llava", "llava", # all llava prefixed models fall under this check
"llava_next",
"llava_next_video",
"llava_onevision",
"mistral3", "mistral3",
"mllama", "mllama",
"paligemma", "paligemma",
"qwen2_vl", "qwen2vl",
"qwem2_5_vl", "qwen2_5_vl",
"video_llava", "videollava",
"vipllava", "vipllava",
] ]

View File

@@ -1381,6 +1381,7 @@ class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel):
@auto_docstring @auto_docstring
class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel): class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
base_model_prefix = ""
_checkpoint_conversion_mapping = {"^model": "language_model"} _checkpoint_conversion_mapping = {"^model": "language_model"}
config_class = Qwen2_5_VLConfig config_class = Qwen2_5_VLConfig
_no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"] _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]

View File

@@ -414,6 +414,7 @@ class Qwen2_5_VLModelOutputWithPast(Qwen2VLModelOutputWithPast):
class Qwen2_5_VLModel(Qwen2VLModel): class Qwen2_5_VLModel(Qwen2VLModel):
config_class = Qwen2_5_VLConfig config_class = Qwen2_5_VLConfig
base_model_prefix = ""
_no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"] _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
def __init__(self, config): def __init__(self, config):

View File

@@ -1341,6 +1341,7 @@ class Qwen2VLTextModel(Qwen2VLPreTrainedModel):
@auto_docstring @auto_docstring
class Qwen2VLModel(Qwen2VLPreTrainedModel): class Qwen2VLModel(Qwen2VLPreTrainedModel):
base_model_prefix = ""
_checkpoint_conversion_mapping = {"^model": "language_model"} _checkpoint_conversion_mapping = {"^model": "language_model"}
def __init__(self, config: Qwen2VLConfig): def __init__(self, config: Qwen2VLConfig):

View File

@@ -144,7 +144,6 @@ class AyaVisionVisionText2TextModelTester:
config, pixel_values = config_and_inputs config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
print("attention_mask", attention_mask.shape)
# input_ids[:, -1] = self.pad_token_id # input_ids[:, -1] = self.pad_token_id
input_ids[input_ids == self.image_token_index] = self.pad_token_id input_ids[input_ids == self.image_token_index] = self.pad_token_id
input_ids[:, : self.image_seq_length] = self.image_token_index input_ids[:, : self.image_seq_length] = self.image_token_index
@@ -366,7 +365,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
output = model(**inputs) output = model(**inputs)
actual_logits = output.logits[0, -1, :5].cpu() actual_logits = output.logits[0, -1, :5].cpu()
print("actual_logits", actual_logits)
expected_logits = torch.tensor([0.4109, 0.1532, 0.8018, 2.1328, 0.5483], dtype=torch.float16) expected_logits = torch.tensor([0.4109, 0.1532, 0.8018, 2.1328, 0.5483], dtype=torch.float16)
self.assertTrue( self.assertTrue(
torch.allclose(actual_logits, expected_logits, atol=0.1), torch.allclose(actual_logits, expected_logits, atol=0.1),
@@ -400,7 +398,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
decoded_output = processor.decode( decoded_output = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
) )
print("decoded_output", decoded_output)
expected_outputs = Expectations( expected_outputs = Expectations(
{ {
@@ -437,7 +434,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
decoded_output = processor.decode( decoded_output = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
) )
print("decoded_output", decoded_output)
expected_output = "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats," # fmt: skip expected_output = "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats," # fmt: skip
self.assertEqual(decoded_output, expected_output) self.assertEqual(decoded_output, expected_output)
@@ -477,7 +473,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
# Check first output # Check first output
decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True) decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
print("decoded_output", decoded_output)
expected_outputs = Expectations( expected_outputs = Expectations(
{ {
("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.", ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
@@ -494,7 +489,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
# Check second output # Check second output
decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True) decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
print("decoded_output", decoded_output)
expected_output = 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a' # fmt: skip expected_output = 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a' # fmt: skip
self.assertEqual( self.assertEqual(
@@ -558,7 +552,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
) # fmt: skip ) # fmt: skip
expected_output = expected_outputs.get_expectation() expected_output = expected_outputs.get_expectation()
print("decoded_output", decoded_output)
self.assertEqual( self.assertEqual(
decoded_output, decoded_output,
expected_output, expected_output,
@@ -567,7 +560,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
# Check second output # Check second output
decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True) decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
print("decoded_output", decoded_output)
expected_outputs = Expectations( expected_outputs = Expectations(
{ {
("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ", ("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ",