[VLM] fix loading issues (#38051)
* fix qwen2-vl loading * fix a few more models * delete print * fix copies
This commit is contained in:
committed by
GitHub
parent
a31fa218ad
commit
a5c6172c81
@@ -219,22 +219,19 @@ TORCH_INIT_FUNCTIONS = {
|
|||||||
# DO NOT MODIFY, KEPT FOR BC ONLY
|
# DO NOT MODIFY, KEPT FOR BC ONLY
|
||||||
VLMS = [
|
VLMS = [
|
||||||
"aria",
|
"aria",
|
||||||
"aya_vision",
|
"ayavision",
|
||||||
"emu3",
|
"emu3",
|
||||||
"fuyu",
|
"fuyu",
|
||||||
"got_ocr2",
|
"gotocr2",
|
||||||
"gemma3",
|
"gemma3",
|
||||||
"internvl",
|
"internvl",
|
||||||
"llava",
|
"llava", # all llava prefixed models fall under this check
|
||||||
"llava_next",
|
|
||||||
"llava_next_video",
|
|
||||||
"llava_onevision",
|
|
||||||
"mistral3",
|
"mistral3",
|
||||||
"mllama",
|
"mllama",
|
||||||
"paligemma",
|
"paligemma",
|
||||||
"qwen2_vl",
|
"qwen2vl",
|
||||||
"qwem2_5_vl",
|
"qwen2_5_vl",
|
||||||
"video_llava",
|
"videollava",
|
||||||
"vipllava",
|
"vipllava",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -1381,6 +1381,7 @@ class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel):
|
|||||||
|
|
||||||
@auto_docstring
|
@auto_docstring
|
||||||
class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
|
class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
|
||||||
|
base_model_prefix = ""
|
||||||
_checkpoint_conversion_mapping = {"^model": "language_model"}
|
_checkpoint_conversion_mapping = {"^model": "language_model"}
|
||||||
config_class = Qwen2_5_VLConfig
|
config_class = Qwen2_5_VLConfig
|
||||||
_no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
|
_no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
|
||||||
|
|||||||
@@ -414,6 +414,7 @@ class Qwen2_5_VLModelOutputWithPast(Qwen2VLModelOutputWithPast):
|
|||||||
|
|
||||||
class Qwen2_5_VLModel(Qwen2VLModel):
|
class Qwen2_5_VLModel(Qwen2VLModel):
|
||||||
config_class = Qwen2_5_VLConfig
|
config_class = Qwen2_5_VLConfig
|
||||||
|
base_model_prefix = ""
|
||||||
_no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
|
_no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
|
|||||||
@@ -1341,6 +1341,7 @@ class Qwen2VLTextModel(Qwen2VLPreTrainedModel):
|
|||||||
|
|
||||||
@auto_docstring
|
@auto_docstring
|
||||||
class Qwen2VLModel(Qwen2VLPreTrainedModel):
|
class Qwen2VLModel(Qwen2VLPreTrainedModel):
|
||||||
|
base_model_prefix = ""
|
||||||
_checkpoint_conversion_mapping = {"^model": "language_model"}
|
_checkpoint_conversion_mapping = {"^model": "language_model"}
|
||||||
|
|
||||||
def __init__(self, config: Qwen2VLConfig):
|
def __init__(self, config: Qwen2VLConfig):
|
||||||
|
|||||||
@@ -144,7 +144,6 @@ class AyaVisionVisionText2TextModelTester:
|
|||||||
config, pixel_values = config_and_inputs
|
config, pixel_values = config_and_inputs
|
||||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
|
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
|
||||||
print("attention_mask", attention_mask.shape)
|
|
||||||
# input_ids[:, -1] = self.pad_token_id
|
# input_ids[:, -1] = self.pad_token_id
|
||||||
input_ids[input_ids == self.image_token_index] = self.pad_token_id
|
input_ids[input_ids == self.image_token_index] = self.pad_token_id
|
||||||
input_ids[:, : self.image_seq_length] = self.image_token_index
|
input_ids[:, : self.image_seq_length] = self.image_token_index
|
||||||
@@ -366,7 +365,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
output = model(**inputs)
|
output = model(**inputs)
|
||||||
|
|
||||||
actual_logits = output.logits[0, -1, :5].cpu()
|
actual_logits = output.logits[0, -1, :5].cpu()
|
||||||
print("actual_logits", actual_logits)
|
|
||||||
expected_logits = torch.tensor([0.4109, 0.1532, 0.8018, 2.1328, 0.5483], dtype=torch.float16)
|
expected_logits = torch.tensor([0.4109, 0.1532, 0.8018, 2.1328, 0.5483], dtype=torch.float16)
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
torch.allclose(actual_logits, expected_logits, atol=0.1),
|
torch.allclose(actual_logits, expected_logits, atol=0.1),
|
||||||
@@ -400,7 +398,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
decoded_output = processor.decode(
|
decoded_output = processor.decode(
|
||||||
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
||||||
)
|
)
|
||||||
print("decoded_output", decoded_output)
|
|
||||||
|
|
||||||
expected_outputs = Expectations(
|
expected_outputs = Expectations(
|
||||||
{
|
{
|
||||||
@@ -437,7 +434,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
decoded_output = processor.decode(
|
decoded_output = processor.decode(
|
||||||
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
||||||
)
|
)
|
||||||
print("decoded_output", decoded_output)
|
|
||||||
expected_output = "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats," # fmt: skip
|
expected_output = "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats," # fmt: skip
|
||||||
self.assertEqual(decoded_output, expected_output)
|
self.assertEqual(decoded_output, expected_output)
|
||||||
|
|
||||||
@@ -477,7 +473,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
|
|
||||||
# Check first output
|
# Check first output
|
||||||
decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
||||||
print("decoded_output", decoded_output)
|
|
||||||
expected_outputs = Expectations(
|
expected_outputs = Expectations(
|
||||||
{
|
{
|
||||||
("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
|
("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
|
||||||
@@ -494,7 +489,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
|
|
||||||
# Check second output
|
# Check second output
|
||||||
decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
||||||
print("decoded_output", decoded_output)
|
|
||||||
expected_output = 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a' # fmt: skip
|
expected_output = 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a' # fmt: skip
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
@@ -558,7 +552,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
) # fmt: skip
|
) # fmt: skip
|
||||||
expected_output = expected_outputs.get_expectation()
|
expected_output = expected_outputs.get_expectation()
|
||||||
|
|
||||||
print("decoded_output", decoded_output)
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
decoded_output,
|
decoded_output,
|
||||||
expected_output,
|
expected_output,
|
||||||
@@ -567,7 +560,6 @@ class AyaVisionIntegrationTest(unittest.TestCase):
|
|||||||
|
|
||||||
# Check second output
|
# Check second output
|
||||||
decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
||||||
print("decoded_output", decoded_output)
|
|
||||||
expected_outputs = Expectations(
|
expected_outputs = Expectations(
|
||||||
{
|
{
|
||||||
("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ",
|
("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ",
|
||||||
|
|||||||
Reference in New Issue
Block a user