[VLMs] use only xxx_token_id for multimodal tokens (#37573)
* use only `xxx_token_id` for multimodal tokens
* update modeling files as well
* fixup
* why fixup doesn't fix modular docstring first?
* janus, need to update configs in the hub still
* last fixup
This commit is contained in:
committed by GitHub
parent 4afd3f4820
commit 2ba6b92a6f
@@ -258,6 +258,9 @@ class AriaConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "aria"
|
||||
attribute_map = {
|
||||
"image_token_id": "image_token_index",
|
||||
}
|
||||
sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig}
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -106,7 +106,7 @@ def convert_aria_llama_to_hf(text_model_id, vision_model_id, output_hub_path, ol
|
||||
config.vision_config.hidden_size = 1152
|
||||
config.vision_config.attention_heads = 16
|
||||
config.pad_token_id = 2
|
||||
config.image_token_index = 9
|
||||
config.image_token_id = 9
|
||||
config.intermediate_size = config.moe_intermediate_size
|
||||
config.auto_map = {
|
||||
"AutoConfig": "modeling_aria.AriaConfig",
|
||||
|
||||
@@ -1507,11 +1507,11 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
|
||||
if pixel_values is not None and inputs_embeds.shape[1] != 1:
|
||||
if input_ids is None:
|
||||
special_image_mask = inputs_embeds == self.get_input_embeddings()(
|
||||
torch.tensor(self.config.image_token_index, dtype=torch.long, device=inputs_embeds.device)
|
||||
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
|
||||
)
|
||||
n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0]
|
||||
else:
|
||||
image_embeds = input_ids == self.config.image_token_index
|
||||
image_embeds = input_ids == self.config.image_token_id
|
||||
special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||
n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0)
|
||||
image_features = self.get_image_features(
|
||||
|
||||
@@ -266,6 +266,9 @@ class AriaConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
model_type = "aria"
|
||||
attribute_map = {
|
||||
"image_token_id": "image_token_index",
|
||||
}
|
||||
sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig}
|
||||
|
||||
def __init__(
|
||||
@@ -1546,11 +1549,11 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
|
||||
if pixel_values is not None and inputs_embeds.shape[1] != 1:
|
||||
if input_ids is None:
|
||||
special_image_mask = inputs_embeds == self.get_input_embeddings()(
|
||||
torch.tensor(self.config.image_token_index, dtype=torch.long, device=inputs_embeds.device)
|
||||
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
|
||||
)
|
||||
n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0]
|
||||
else:
|
||||
image_embeds = input_ids == self.config.image_token_index
|
||||
image_embeds = input_ids == self.config.image_token_id
|
||||
special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||
n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0)
|
||||
image_features = self.get_image_features(
|
||||
|
||||
Reference in New Issue
Block a user