[VLMs] use only xxx_token_id for multimodal tokens (#37573)

* use only `xxx_token_id` for multimodal tokens

* update modeling files as well

* fixup

* why fixup doesn't fix modular docstring first?

* janus, need to update configs in the hub still

* last fixup
This commit is contained in:
Raushan Turganbay
2025-04-18 17:03:39 +02:00
committed by GitHub
parent 4afd3f4820
commit 2ba6b92a6f
63 changed files with 279 additions and 141 deletions

View File

@@ -258,6 +258,9 @@ class AriaConfig(PretrainedConfig):
"""
model_type = "aria"
attribute_map = {
"image_token_id": "image_token_index",
}
sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig}
def __init__(

View File

@@ -106,7 +106,7 @@ def convert_aria_llama_to_hf(text_model_id, vision_model_id, output_hub_path, ol
config.vision_config.hidden_size = 1152
config.vision_config.attention_heads = 16
config.pad_token_id = 2
config.image_token_index = 9
config.image_token_id = 9
config.intermediate_size = config.moe_intermediate_size
config.auto_map = {
"AutoConfig": "modeling_aria.AriaConfig",

View File

@@ -1507,11 +1507,11 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
if pixel_values is not None and inputs_embeds.shape[1] != 1:
if input_ids is None:
special_image_mask = inputs_embeds == self.get_input_embeddings()(
torch.tensor(self.config.image_token_index, dtype=torch.long, device=inputs_embeds.device)
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
)
n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0]
else:
image_embeds = input_ids == self.config.image_token_index
image_embeds = input_ids == self.config.image_token_id
special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0)
image_features = self.get_image_features(

View File

@@ -266,6 +266,9 @@ class AriaConfig(PretrainedConfig):
"""
model_type = "aria"
attribute_map = {
"image_token_id": "image_token_index",
}
sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig}
def __init__(
@@ -1546,11 +1549,11 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
if pixel_values is not None and inputs_embeds.shape[1] != 1:
if input_ids is None:
special_image_mask = inputs_embeds == self.get_input_embeddings()(
torch.tensor(self.config.image_token_index, dtype=torch.long, device=inputs_embeds.device)
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
)
n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0]
else:
image_embeds = input_ids == self.config.image_token_index
image_embeds = input_ids == self.config.image_token_id
special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0)
image_features = self.get_image_features(