[VLMs] split out "get placeholder mask" to helper (#39777)
* batch update all models * update * forgot about llava onevision * update * fix tests * delete file * typo * fix emu3 once and forever * update cohere2 vision as well
This commit is contained in:
committed by
GitHub
parent
a115b67392
commit
d3b8627b56
@@ -89,9 +89,9 @@ class DeepseekVLModelTester:
|
||||
self.hidden_size = text_config["hidden_size"]
|
||||
self.num_attention_heads = text_config["num_attention_heads"]
|
||||
self.image_size = vision_config["image_size"]
|
||||
self.num_image_tokens = vision_config["image_size"] // vision_config["patch_size"]
|
||||
self.num_image_tokens = 16
|
||||
self.pad_token_id = text_config["pad_token_id"]
|
||||
self.image_token_id = self.vocab_size - 1
|
||||
self.image_token_id = 0
|
||||
|
||||
def get_config(self):
|
||||
return DeepseekVLConfig(
|
||||
@@ -115,6 +115,7 @@ class DeepseekVLModelTester:
|
||||
]
|
||||
)
|
||||
# fill image_tokens
|
||||
input_ids[input_ids == self.num_image_tokens] = config.text_config.pad_token_id
|
||||
input_ids[:, : self.num_image_tokens] = self.image_token_id
|
||||
|
||||
return config, input_ids, attention_mask, pixel_values
|
||||
|
||||
@@ -198,12 +198,12 @@ class Emu3Vision2TextModelTester:
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
image_token_id=3,
|
||||
image_size=30,
|
||||
image_size=15,
|
||||
codebook_size=20,
|
||||
temporal_downsample_factor=1,
|
||||
base_channels=32,
|
||||
vq_channel_multiplier=[1, 1],
|
||||
image_seq_length=100,
|
||||
vq_channel_multiplier=[1, 2, 1],
|
||||
image_seq_length=12,
|
||||
vq_img_token_start_id=3,
|
||||
):
|
||||
self.parent = parent
|
||||
@@ -288,6 +288,7 @@ class Emu3Vision2TextModelTester:
|
||||
"base_channels": self.base_channels,
|
||||
"channel_multiplier": self.vq_channel_multiplier,
|
||||
"hidden_size": self.base_channels,
|
||||
"attn_resolutions": [],
|
||||
}
|
||||
return Emu3Config(text_config=text_config, vq_config=vq_config, vocabulary_map=vocab_map)
|
||||
|
||||
@@ -358,6 +359,10 @@ class Emu3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
|
||||
def test_generate_with_static_cache(self):
|
||||
pass
|
||||
|
||||
# @unittest.skip("Emu3 can't be smaller than currently if we want to downsample images")
|
||||
# def test_model_is_small(self):
|
||||
# pass
|
||||
|
||||
|
||||
@require_torch
|
||||
class Emu3IntegrationTest(unittest.TestCase):
|
||||
|
||||
@@ -89,7 +89,7 @@ class JanusVisionText2TextModelTester:
|
||||
"use_labels": True,
|
||||
"image_size": 20,
|
||||
"patch_size": 5,
|
||||
"num_image_tokens": 4,
|
||||
"num_image_tokens": 16,
|
||||
"num_channels": 3,
|
||||
"is_training": True,
|
||||
"hidden_size": 32,
|
||||
|
||||
@@ -61,6 +61,7 @@ class LlavaOnevisionVisionText2TextModelTester:
|
||||
parent,
|
||||
ignore_index=-100,
|
||||
image_token_index=1,
|
||||
video_token_index=2,
|
||||
projector_hidden_act="gelu",
|
||||
seq_length=7,
|
||||
vision_feature_select_strategy="full",
|
||||
@@ -108,6 +109,7 @@ class LlavaOnevisionVisionText2TextModelTester:
|
||||
self.parent = parent
|
||||
self.ignore_index = ignore_index
|
||||
self.image_token_index = image_token_index
|
||||
self.video_token_index = video_token_index
|
||||
self.projector_hidden_act = projector_hidden_act
|
||||
self.vision_feature_select_strategy = vision_feature_select_strategy
|
||||
self.vision_feature_layer = vision_feature_layer
|
||||
@@ -134,6 +136,7 @@ class LlavaOnevisionVisionText2TextModelTester:
|
||||
vision_config=self.vision_config,
|
||||
ignore_index=self.ignore_index,
|
||||
image_token_index=self.image_token_index,
|
||||
video_token_index=self.video_token_index,
|
||||
projector_hidden_act=self.projector_hidden_act,
|
||||
vision_feature_select_strategy=self.vision_feature_select_strategy,
|
||||
vision_feature_layer=self.vision_feature_layer,
|
||||
|
||||
Reference in New Issue
Block a user