[VLMs] split out "get placeholder mask" to helper (#39777)

* batch update all models

* update

* forgot about llava onevision

* update

* fix tests

* delete file

* typo

* fix emu3 once and forever

* update cohere2 vision as well
This commit is contained in:
Raushan Turganbay
2025-08-01 10:01:06 +02:00
committed by GitHub
parent a115b67392
commit d3b8627b56
52 changed files with 1370 additions and 1069 deletions

View File

@@ -89,9 +89,9 @@ class DeepseekVLModelTester:
self.hidden_size = text_config["hidden_size"]
self.num_attention_heads = text_config["num_attention_heads"]
self.image_size = vision_config["image_size"]
self.num_image_tokens = vision_config["image_size"] // vision_config["patch_size"]
self.num_image_tokens = 16
self.pad_token_id = text_config["pad_token_id"]
self.image_token_id = self.vocab_size - 1
self.image_token_id = 0
def get_config(self):
return DeepseekVLConfig(
@@ -115,6 +115,7 @@ class DeepseekVLModelTester:
]
)
# fill image_tokens
input_ids[input_ids == self.num_image_tokens] = config.text_config.pad_token_id
input_ids[:, : self.num_image_tokens] = self.image_token_id
return config, input_ids, attention_mask, pixel_values

View File

@@ -198,12 +198,12 @@ class Emu3Vision2TextModelTester:
bos_token_id=1,
eos_token_id=2,
image_token_id=3,
image_size=30,
image_size=15,
codebook_size=20,
temporal_downsample_factor=1,
base_channels=32,
vq_channel_multiplier=[1, 1],
image_seq_length=100,
vq_channel_multiplier=[1, 2, 1],
image_seq_length=12,
vq_img_token_start_id=3,
):
self.parent = parent
@@ -288,6 +288,7 @@ class Emu3Vision2TextModelTester:
"base_channels": self.base_channels,
"channel_multiplier": self.vq_channel_multiplier,
"hidden_size": self.base_channels,
"attn_resolutions": [],
}
return Emu3Config(text_config=text_config, vq_config=vq_config, vocabulary_map=vocab_map)
@@ -358,6 +359,10 @@ class Emu3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
def test_generate_with_static_cache(self):
pass
# @unittest.skip("Emu3 can't be smaller than currently if we want to downsample images")
# def test_model_is_small(self):
# pass
@require_torch
class Emu3IntegrationTest(unittest.TestCase):

View File

@@ -89,7 +89,7 @@ class JanusVisionText2TextModelTester:
"use_labels": True,
"image_size": 20,
"patch_size": 5,
"num_image_tokens": 4,
"num_image_tokens": 16,
"num_channels": 3,
"is_training": True,
"hidden_size": 32,

View File

@@ -61,6 +61,7 @@ class LlavaOnevisionVisionText2TextModelTester:
parent,
ignore_index=-100,
image_token_index=1,
video_token_index=2,
projector_hidden_act="gelu",
seq_length=7,
vision_feature_select_strategy="full",
@@ -108,6 +109,7 @@ class LlavaOnevisionVisionText2TextModelTester:
self.parent = parent
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.video_token_index = video_token_index
self.projector_hidden_act = projector_hidden_act
self.vision_feature_select_strategy = vision_feature_select_strategy
self.vision_feature_layer = vision_feature_layer
@@ -134,6 +136,7 @@ class LlavaOnevisionVisionText2TextModelTester:
vision_config=self.vision_config,
ignore_index=self.ignore_index,
image_token_index=self.image_token_index,
video_token_index=self.video_token_index,
projector_hidden_act=self.projector_hidden_act,
vision_feature_select_strategy=self.vision_feature_select_strategy,
vision_feature_layer=self.vision_feature_layer,