From 71cc8161b2470f9a3ec433dff345e224eb3277d5 Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Thu, 23 Jan 2025 09:15:52 -0700 Subject: [PATCH] Granite Vision Support (#35579) * Add multimodal granite support Signed-off-by: Alex-Brooks Support multiple image feature layers Signed-off-by: Alex-Brooks * Remove failing validation for visual encoders with no cls Signed-off-by: Alex-Brooks * Update llava based models / configs to support list of feature layers Signed-off-by: Alex-Brooks * Add tests for multiple feature layers Signed-off-by: Alex-Brooks * Use conditional instead of except for misaligned feature shapes Signed-off-by: Alex-Brooks * crop cls from each hidden state Signed-off-by: Alex-Brooks * Fix formatting Signed-off-by: Alex-Brooks * Support single vision feature int in vipllava Signed-off-by: Alex-Brooks * Fix typo in vision feature selection strategy validation Signed-off-by: Alex-Brooks * Add tentative integration test for granite vision models Signed-off-by: Alex-Brooks * Add granite vision docs Replace multimodal granite refs with granite vision Add granite vision / llava next alias Signed-off-by: Alex-Brooks * Use image url in granitevision example Signed-off-by: Alex-Brooks --------- Signed-off-by: Alex-Brooks Signed-off-by: Alex-Brooks --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/granitevision.md | 90 +++++++++++++++++++ .../models/auto/configuration_auto.py | 3 + .../models/llava/configuration_llava.py | 6 +- .../models/llava/modeling_llava.py | 51 +++++++---- .../llava_next/configuration_llava_next.py | 6 +- .../models/llava_next/modeling_llava_next.py | 50 +++++++---- .../configuration_llava_next_video.py | 6 +- .../modeling_llava_next_video.py | 70 +++++++++++---- .../modular_llava_next_video.py | 46 +++++++--- .../configuration_llava_onevision.py | 6 +- .../modeling_llava_onevision.py | 51 ++++++++--- .../video_llava/configuration_video_llava.py | 6 +- .../video_llava/modeling_video_llava.py | 66 ++++++++++---- 
.../models/vipllava/configuration_vipllava.py | 4 +- .../models/vipllava/modeling_vipllava.py | 26 +++--- tests/models/llava/test_modeling_llava.py | 27 ++++++ .../llava_next/test_modeling_llava_next.py | 49 ++++++++++ .../test_modeling_llava_next_video.py | 27 ++++++ .../test_modeling_llava_onevision.py | 27 ++++++ .../video_llava/test_modeling_video_llava.py | 27 ++++++ .../models/vipllava/test_modeling_vipllava.py | 32 +++++++ 22 files changed, 567 insertions(+), 111 deletions(-) create mode 100644 docs/source/en/model_doc/granitevision.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 4eb5b14e23..34aacd0796 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -448,6 +448,8 @@ title: Granite - local: model_doc/granitemoe title: GraniteMoe + - local: model_doc/granitevision + title: GraniteVision - local: model_doc/helium title: Helium - local: model_doc/herbert diff --git a/docs/source/en/model_doc/granitevision.md b/docs/source/en/model_doc/granitevision.md new file mode 100644 index 0000000000..42f9df2ee3 --- /dev/null +++ b/docs/source/en/model_doc/granitevision.md @@ -0,0 +1,90 @@ + + +# Granite Vision + +## Overview + +The Granite Vision model is a variant of [LLaVA-NeXT](llava_next), leveraging a [Granite](granite) language model alongside a [SigLIP](siglip) visual encoder. It utilizes multiple concatenated vision hidden states as its image features, similar to [VipLlava](vipllava). It also uses a larger set of image grid pinpoints than the original LLaVA-NeXT models to support additional aspect ratios. + +Tips: +- This model is loaded into Transformers as an instance of LLaVA-NeXT. The usage and tips from [LLaVA-NeXT](llava_next) apply to this model as well. + +- You can apply the chat template on the tokenizer / processor in the same way as well. 
Example chat format: +```bash +"<|user|>\nWhat’s shown in this image?\n<|assistant|>\nThis image shows a red stop sign.<|end_of_text|><|user|>\nDescribe the image in more details.\n<|assistant|>\n" +``` + +Sample inference: +```python +from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration +from PIL import Image +import requests + +# Note: These docs were written prior to the public model release, +# and this path is subject to change. +# Please see https://huggingface.co/ibm-granite for the current model list. +model_path = "ibm-granite/granite-3.1-2b-instruct-vision" +processor = LlavaNextProcessor.from_pretrained(model_path) + +model = LlavaNextForConditionalGeneration.from_pretrained(model_path).to("cuda") + +# prepare image and text prompt, using the appropriate prompt template +url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" + +conversation = [ + { + "role": "user", + "content": [ + {"type": "image", "url": url}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, +] +inputs = processor.apply_chat_template( + conversation, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt" +).to("cuda") + + +# autoregressively complete prompt +output = model.generate(**inputs, max_new_tokens=100) + +print(processor.decode(output[0], skip_special_tokens=True)) +``` + +This model was contributed by [Alexander Brooks](https://huggingface.co/abrooks9944). 
+ +## LlavaNextConfig + +[[autodoc]] LlavaNextConfig + +## LlavaNextImageProcessor + +[[autodoc]] LlavaNextImageProcessor + - preprocess + +## LlavaNextProcessor + +[[autodoc]] LlavaNextProcessor + +## LlavaNextForConditionalGeneration + +[[autodoc]] LlavaNextForConditionalGeneration + - forward diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index aaac3551d6..f4590c81c7 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -134,6 +134,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("gptsan-japanese", "GPTSanJapaneseConfig"), ("granite", "GraniteConfig"), ("granitemoe", "GraniteMoeConfig"), + ("granitevision", "LlavaNextConfig"), ("graphormer", "GraphormerConfig"), ("grounding-dino", "GroundingDinoConfig"), ("groupvit", "GroupViTConfig"), @@ -458,6 +459,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("gptsan-japanese", "GPTSAN-japanese"), ("granite", "Granite"), ("granitemoe", "GraniteMoeMoe"), + ("granitevision", "LLaVA-NeXT"), ("graphormer", "Graphormer"), ("grounding-dino", "Grounding DINO"), ("groupvit", "GroupViT"), @@ -729,6 +731,7 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict( ("siglip_vision_model", "siglip"), ("chinese_clip_vision_model", "chinese_clip"), ("rt_detr_resnet", "rt_detr"), + ("granitevision", "llava_next"), ] ) diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index 58bf40d6ce..d2a3e9747b 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -46,8 +46,10 @@ class LlavaConfig(PretrainedConfig): vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"`. 
- vision_feature_layer (`int`, *optional*, defaults to -2): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. image_seq_length (`int`, *optional*, defaults to 576): Sequence length of one image embedding. multimodal_projector_bias (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index fcf016f28f..7b40550722 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -87,8 +87,12 @@ class LlavaCausalLMOutputWithPast(ModelOutput): class LlavaMultiModalProjector(nn.Module): def __init__(self, config: LlavaConfig): super().__init__() + # We have hidden_size * the number of vision feature layers + num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer) self.linear_1 = nn.Linear( - config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias + config.vision_config.hidden_size * num_feature_layers, + config.text_config.hidden_size, + bias=config.multimodal_projector_bias, ) self.act = ACT2FN[config.projector_hidden_act] self.linear_2 = nn.Linear( @@ -208,8 +212,10 @@ LLAVA_INPUTS_DOCSTRING = r""" Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - vision_feature_layer (`int`, *optional*, defaults to -2): - The index of the layer to select the vision feature. 
+ vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"`. @@ -270,7 +276,10 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin): return self.language_model.get_decoder() def get_image_features( - self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Union[int, List[int]], + vision_feature_select_strategy: str, ): """ Obtains image last hidden states from the vision tower and apply multimodal projection. @@ -278,23 +287,35 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin): Args: pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. - vision_feature_layer (`int`): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"` Returns: image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). """ - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) - # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. 
- selected_image_feature = image_outputs.hidden_states[vision_feature_layer] - if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature - else: + if vision_feature_select_strategy not in ["default", "full"]: raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + + # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + else: + hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + # For default; crop CLS from each hidden state in the hidden state pool + if vision_feature_select_strategy == "default": + hs_pool = [hs[:, 1:] for hs in hs_pool] + selected_image_feature = torch.cat(hs_pool, dim=-1) + image_features = self.multi_modal_projector(selected_image_feature) return image_features @@ -392,7 +413,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin): position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[int] = None, + vision_feature_layer: Optional[Union[int, List[int]]] = None, vision_feature_select_strategy: Optional[str] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, diff --git 
a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index 6cb76c5b9d..2610275ced 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -46,8 +46,10 @@ class LlavaNextConfig(PretrainedConfig): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. If `"full"`, the full vision features are used. - vision_feature_layer (`int`, *optional*, defaults to -2): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`): A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list of the form `(height, width)`. 
diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 8bff9dc900..b9387eaab0 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -195,8 +195,12 @@ class LlavaNextCausalLMOutputWithPast(ModelOutput): class LlavaNextMultiModalProjector(nn.Module): def __init__(self, config: LlavaNextConfig): super().__init__() + # We have hidden_size * the number of vision feature layers + num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer) self.linear_1 = nn.Linear( - config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias + config.vision_config.hidden_size * num_feature_layers, + config.text_config.hidden_size, + bias=config.multimodal_projector_bias, ) self.act = ACT2FN[config.projector_hidden_act] self.linear_2 = nn.Linear( @@ -319,8 +323,10 @@ LLAVA_NEXT_INPUTS_DOCSTRING = r""" Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - vision_feature_layer (`int`, *optional*, defaults to -2): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. 
@@ -664,18 +670,22 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi image_feature = image_feature[1:] height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size - if vision_feature_select_strategy == "default": - expected_num_patches = height * width - elif vision_feature_select_strategy == "full": - expected_num_patches = height * width + 1 - if expected_num_patches != base_image_feature.shape[0]: - raise ValueError("The number of patches is not consistent with the image size.") - num_patch_height, num_patch_width = get_anyres_image_grid_shape( image_sizes[image_idx], self.config.image_grid_pinpoints, self.config.vision_config.image_size, ) + + if ( + np.prod(image_feature.shape) % (num_patch_height * num_patch_width * height * width) != 0 + and vision_feature_select_strategy == "default" + ): + logger.warning_once( + "Image feature shape does not line up with the provided patch size. " + "You may be using the `default` vision_feature_select_strategy with a" + " visual encoder that does not have CLS." + ) + image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() image_feature = image_feature.flatten(1, 2).flatten(2, 3) @@ -706,7 +716,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int, + vision_feature_layer: Union[int, List[int]], vision_feature_select_strategy: str, ): """ @@ -717,8 +727,10 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi The tensors corresponding to the input images. image_sizes (`torch.Tensor` of shape `(num_images, 2)`) Actual image size of each images (H, W). - vision_feature_layer (`int`): - The index of the layer to select the vision feature. 
+ vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"` @@ -744,11 +756,19 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") image_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_features.hidden_states[vision_feature_layer] + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_features.hidden_states[vision_feature_layer] + else: + hs_pool = [image_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + selected_image_feature = torch.cat(hs_pool, dim=-1) + if vision_feature_select_strategy == "default": selected_image_feature = selected_image_feature[:, 1:] elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature + image_features = self.multi_modal_projector(selected_image_feature) image_features = torch.split(image_features, image_num_patches, dim=0) return image_features @@ -765,7 +785,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[int] = None, + vision_feature_layer: Optional[Union[int, List[int]]] = None, vision_feature_select_strategy: Optional[str] = None, 
labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 77089ed0f3..6b85ebb445 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -50,8 +50,10 @@ class LlavaNextVideoConfig(PretrainedConfig): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. If `"full"`, the full vision features are used. - vision_feature_layer (`int`, *optional*, defaults to -2): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`): A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list of the form `(height, width)`. 
diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index c82d52bfda..3e288520ed 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -180,8 +180,12 @@ class LlavaNextVideoPreTrainedModel(PreTrainedModel): class LlavaNextVideoMultiModalProjector(nn.Module): def __init__(self, config: LlavaNextVideoConfig): super().__init__() + # We have hidden_size * the number of vision feature layers + num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer) self.linear_1 = nn.Linear( - config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias + config.vision_config.hidden_size * num_feature_layers, + config.text_config.hidden_size, + bias=config.multimodal_projector_bias, ) self.act = ACT2FN[config.projector_hidden_act] self.linear_2 = nn.Linear( @@ -356,8 +360,10 @@ LLAVA_NEXT_VIDEO_INPUTS_DOCSTRING = r""" Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - vision_feature_layer (`int`, *optional*, defaults to -2): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"`. 
If `"default"`, the CLS token is removed from the vision features. @@ -699,18 +705,22 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene image_feature = image_feature[1:] height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size - if vision_feature_select_strategy == "default": - expected_num_patches = height * width - elif vision_feature_select_strategy == "full": - expected_num_patches = height * width + 1 - if expected_num_patches != base_image_feature.shape[0]: - raise ValueError("The number of patches is not consistent with the image size.") - num_patch_height, num_patch_width = get_anyres_image_grid_shape( image_sizes[image_idx], self.config.image_grid_pinpoints, self.config.vision_config.image_size, ) + + if ( + np.prod(image_feature.shape) % (num_patch_height * num_patch_width * height * width) != 0 + and vision_feature_select_strategy == "default" + ): + logger.warning_once( + "Image feature shape does not line up with the provided patch size. " + "You may be using the `default` vision_feature_select_strategy with a" + " visual encoder that does not have CLS." + ) + image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() image_feature = image_feature.flatten(1, 2).flatten(2, 3) @@ -741,7 +751,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int, + vision_feature_layer: Union[int, List[int]], vision_feature_select_strategy: str, ): """ @@ -752,8 +762,10 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene The tensors corresponding to the input images. image_sizes (`torch.Tensor` of shape `(num_images, 2)`) Actual image size of each images (H, W). - vision_feature_layer (`int`): - The index of the layer to select the vision feature. 
+ vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"` @@ -779,7 +791,14 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") image_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_features.hidden_states[vision_feature_layer] + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_features.hidden_states[vision_feature_layer] + else: + hs_pool = [image_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + selected_image_feature = torch.cat(hs_pool, dim=-1) + if vision_feature_select_strategy == "default": selected_image_feature = selected_image_feature[:, 1:] elif vision_feature_select_strategy == "full": @@ -801,7 +820,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[int] = None, + vision_feature_layer: Optional[Union[int, List[int]]] = None, vision_feature_select_strategy: Optional[str] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, @@ -1043,7 +1062,10 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene return model_inputs def 
get_video_features( - self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Union[int, List[int]], + vision_feature_select_strategy: str, ): """ Obtains video last hidden states from the vision tower and apply multimodal projection. @@ -1051,8 +1073,10 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene Args: pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. - vision_feature_layer (`int`): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`): The feature selection strategy used to select the vision feature from the vision backbone. 
Can be one of `"default"` or `"full"` @@ -1063,7 +1087,15 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene batch_size, frames, channels, height, width = pixel_values.shape pixel_values = pixel_values.reshape(batch_size * frames, channels, height, width) video_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_video_features = video_features.hidden_states[vision_feature_layer] + + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_video_features = video_features.hidden_states[vision_feature_layer] + else: + hs_pool = [video_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + selected_video_features = torch.cat(hs_pool, dim=-1) + if vision_feature_select_strategy == "default": selected_video_features = selected_video_features[:, 1:] elif vision_feature_select_strategy == "full": diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 580f890b42..77e0f08c7e 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -64,8 +64,10 @@ class LlavaNextVideoConfig(PretrainedConfig): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. If `"full"`, the full vision features are used. - vision_feature_layer (`int`, *optional*, defaults to -2): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. 
If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`): A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list of the form `(height, width)`. @@ -237,7 +239,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration): self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int, + vision_feature_layer: Union[int, List[int]], vision_feature_select_strategy: str, ): """ @@ -248,8 +250,10 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration): The tensors corresponding to the input images. image_sizes (`torch.Tensor` of shape `(num_images, 2)`) Actual image size of each images (H, W). - vision_feature_layer (`int`): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`): The feature selection strategy used to select the vision feature from the vision backbone. 
Can be one of `"default"` or `"full"` @@ -275,7 +279,14 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration): raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") image_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_features.hidden_states[vision_feature_layer] + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_features.hidden_states[vision_feature_layer] + else: + hs_pool = [image_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + selected_image_feature = torch.cat(hs_pool, dim=-1) + if vision_feature_select_strategy == "default": selected_image_feature = selected_image_feature[:, 1:] elif vision_feature_select_strategy == "full": @@ -285,7 +296,10 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration): return image_features def get_video_features( - self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Union[int, List[int]], + vision_feature_select_strategy: str, ): """ Obtains video last hidden states from the vision tower and apply multimodal projection. @@ -293,8 +307,10 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration): Args: pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. - vision_feature_layer (`int`): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature. 
If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"` @@ -305,7 +321,15 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration): batch_size, frames, channels, height, width = pixel_values.shape pixel_values = pixel_values.reshape(batch_size * frames, channels, height, width) video_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_video_features = video_features.hidden_states[vision_feature_layer] + + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_video_features = video_features.hidden_states[vision_feature_layer] + else: + hs_pool = [video_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + selected_video_features = torch.cat(hs_pool, dim=-1) + if vision_feature_select_strategy == "default": selected_video_features = selected_video_features[:, 1:] elif vision_feature_select_strategy == "full": @@ -327,7 +351,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration): position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[int] = None, + vision_feature_layer: Optional[Union[int, List[int]]] = None, vision_feature_select_strategy: Optional[str] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index 504e8a7878..c3d43d69d7 
100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -49,8 +49,10 @@ class LlavaOnevisionConfig(PretrainedConfig): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. If `"full"`, the full vision features are used. - vision_feature_layer (`int`, *optional*, defaults to -1): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -1): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`): Aspect ratio used when processong image features. The default value is "anyres_max_9". 
image_grid_pinpoints (`List`, *optional*): diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index f1cf7a6c2d..b75ef9ab0b 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -202,8 +202,12 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput): class LlavaOnevisionMultiModalProjector(nn.Module): def __init__(self, config: LlavaOnevisionConfig): super().__init__() + # We have hidden_size * the number of vision feature layers + num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer) self.linear_1 = nn.Linear( - config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias + config.vision_config.hidden_size * num_feature_layers, + config.text_config.hidden_size, + bias=config.multimodal_projector_bias, ) self.act = ACT2FN[config.projector_hidden_act] self.linear_2 = nn.Linear( @@ -334,8 +338,10 @@ LLAVA_ONEVISION_INPUTS_DOCSTRING = r""" Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - vision_feature_layer (`int`, *optional*, defaults to -2): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone.
Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. @@ -488,7 +494,7 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, Gene self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - vision_feature_layer: int, + vision_feature_layer: Union[int, List[int]], vision_feature_select_strategy: str, ): """ @@ -499,8 +505,10 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, Gene The tensors corresponding to the input images. image_sizes (`torch.Tensor` of shape `(num_images, 2)`) Actual image size of each images (H, W). - vision_feature_layer (`int`): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`): The feature selection strategy used to select the vision feature from the vision backbone.
Can be one of `"default"` or `"full"` @@ -526,7 +534,14 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, Gene raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") image_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_features.hidden_states[vision_feature_layer] + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_features.hidden_states[vision_feature_layer] + else: + hs_pool = [image_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + selected_image_feature = torch.cat(hs_pool, dim=-1) + if vision_feature_select_strategy == "default": selected_image_feature = selected_image_feature[:, 1:] elif vision_feature_select_strategy == "full": @@ -536,7 +551,10 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, Gene return image_features def get_video_features( - self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Union[int, List[int]], + vision_feature_select_strategy: str, ): """ Obtains video last hidden states from the vision tower, apply multimodal projection and pooling. @@ -544,8 +562,10 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, Gene Args: pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. - vision_feature_layer (`int`): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature.
If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"` @@ -556,7 +576,14 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, Gene batch_size, frames, channels, height, width = pixel_values.shape pixel_values = pixel_values.view(batch_size * frames, channels, height, width) video_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_video_feature = video_features.hidden_states[vision_feature_layer] + + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_video_feature = video_features.hidden_states[vision_feature_layer] + else: + hs_pool = [video_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + selected_video_feature = torch.cat(hs_pool, dim=-1) if vision_feature_select_strategy == "default": selected_video_feature = selected_video_feature[:, 1:] @@ -582,7 +609,7 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, Gene position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[int] = None, + vision_feature_layer: Optional[Union[int, List[int]]] = None, vision_feature_select_strategy: Optional[str] = None, vision_aspect_ratio: Optional[str] = None, labels: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/video_llava/configuration_video_llava.py b/src/transformers/models/video_llava/configuration_video_llava.py index 2342e16da4..becd200403 100644 --- 
a/src/transformers/models/video_llava/configuration_video_llava.py +++ b/src/transformers/models/video_llava/configuration_video_llava.py @@ -49,8 +49,10 @@ class VideoLlavaConfig(PretrainedConfig): vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the CLIP backbone. Can be either "full" to select all features or "default" to select features without `CLS`. - vision_feature_layer (`int`, *optional*, defaults to -2): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. image_seq_length (`int`, *optional*, defaults to 256): Sequence length of one image embedding. video_seq_length (`int`, *optional*, defaults to 2056): diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index f592da8185..c7dd0a1f93 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -89,8 +89,12 @@ class VideoLlavaCausalLMOutputWithPast(ModelOutput): class VideoLlavaMultiModalProjector(nn.Module): def __init__(self, config: VideoLlavaConfig): super().__init__() + # We have hidden_size * the number of vision feature layers + num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer) self.linear_1 = nn.Linear( - config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias + config.vision_config.hidden_size * num_feature_layers, + config.text_config.hidden_size, + bias=config.multimodal_projector_bias, ) self.act = ACT2FN[config.projector_hidden_act] self.linear_2 = nn.Linear( @@ 
-210,8 +214,10 @@ VIDEO_LLAVA_INPUTS_DOCSTRING = r""" Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - vision_feature_layer (`int`, *optional*, defaults to -2): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"` @@ -357,7 +363,10 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi return final_embedding, final_attention_mask, final_labels, position_ids, final_input_ids def get_image_features( - self, pixel_values_images: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str + self, + pixel_values_images: torch.FloatTensor, + vision_feature_layer: Union[int, List[int]], + vision_feature_select_strategy: str, ): """ Obtains image last hidden states from the vision tower and apply multimodal projection. @@ -365,38 +374,53 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi Args: pixel_values_images (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. - vision_feature_layer (`int`): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature.
If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. vision_feature_select_strategy (`str`): The feature selection strategy used to select the vision feature from the vision backbone. Can be one of `"default"` or `"full"` Returns: image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). """ + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") image_outputs = self.image_tower(pixel_values_images, output_hidden_states=True) - image_outputs = image_outputs.hidden_states[vision_feature_layer].squeeze(1) - if vision_feature_select_strategy == "default": - image_outputs = image_outputs[:, 1:] - elif vision_feature_select_strategy == "full": - image_outputs = image_outputs + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + image_outputs = image_outputs.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + image_outputs = image_outputs[:, 1:] else: - raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + # For default; crop CLS from each hidden state in the hidden state pool + if vision_feature_select_strategy == "default": + hs_pool = [hs[:, 1:] for hs in hs_pool] + image_outputs = torch.cat(hs_pool, dim=-1) image_features = self.multi_modal_projector(image_outputs) return image_features - def get_video_features(self, pixel_values_videos: torch.FloatTensor, vision_feature_layer: int): + def get_video_features( + self, + pixel_values_videos: torch.FloatTensor, + vision_feature_layer: Union[int, 
List[int]], + ): """ Obtains video last hidden states from the vision tower and apply multimodal projection. Args: pixel_values_videos (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input videos. - vision_feature_layer (`int`): - The index of the layer to select the vision feature. + vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. Returns: video_features (`torch.Tensor`): Video feature tensor of shape `(num_videos * num_frames, image_length, embed_dim)`). frames (`int`): Number of frames the videos have. @@ -405,7 +429,15 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi pixel_values = pixel_values_videos.reshape(batch_size_vid * num_frames, channels, height, width) video_outputs = self.video_tower(pixel_values, output_hidden_states=True) - video_features = video_outputs.hidden_states[vision_feature_layer].squeeze(1) + + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + video_features = video_outputs.hidden_states[vision_feature_layer] + else: + hs_pool = [video_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + video_features = torch.cat(hs_pool, dim=-1) + video_features = self.multi_modal_projector(video_features) return video_features, num_frames @@ -422,7 +454,7 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[int] = None, + vision_feature_layer: Optional[Union[int, 
List[int]]] = None, vision_feature_select_strategy: Optional[str] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py index d301721e69..94d890c4b8 100644 --- a/src/transformers/models/vipllava/configuration_vipllava.py +++ b/src/transformers/models/vipllava/configuration_vipllava.py @@ -45,8 +45,8 @@ class VipLlavaConfig(PretrainedConfig): The activation function used by the multimodal projector. projector_layernorm_eps (`float`, *optional*, defaults to 1e-05): The layer norm epsilon of the projector layernorm - vision_feature_layers (`List[int]`, *optional*, defaults to `[-2, -5, -8, -11, 6]`): - The list of layers to select the vision features from. + vision_feature_layers (`Union[int, List[int]]`, *optional*, defaults to `[-2, -5, -8, -11, 6]`): + The vision feature layer, or list of layers to select the vision features from. image_seq_length (`int`, *optional*, defaults to 576): Sequence length of one image embedding. 
diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 8ef881b771..0fbef33086 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -85,12 +85,13 @@ class VipLlavaCausalLMOutputWithPast(ModelOutput): class VipLlavaMultiModalProjector(nn.Module): def __init__(self, config: VipLlavaConfig): super().__init__() + num_feature_layers = 1 if isinstance(config.vision_feature_layers, int) else len(config.vision_feature_layers) self.projector_layernorm = nn.LayerNorm( - len(config.vision_feature_layers) * config.vision_config.hidden_size, eps=config.projector_layernorm_eps + num_feature_layers * config.vision_config.hidden_size, eps=config.projector_layernorm_eps ) self.linear_1 = nn.Linear( - len(config.vision_feature_layers) * config.vision_config.hidden_size, + num_feature_layers * config.vision_config.hidden_size, config.text_config.hidden_size, bias=True, ) @@ -270,24 +271,29 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin) return self.language_model.get_decoder() # Ignore copy - def get_image_features(self, pixel_values: torch.FloatTensor, vision_feature_layers: List[int]): + def get_image_features(self, pixel_values: torch.FloatTensor, vision_feature_layers: Union[int, List[int]]): """ Obtains image last hidden states from the vision tower and apply multimodal projection. Args: pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. - vision_feature_layers (`List[int]`): - The list og indexes of the layers to select the vision feature. + vision_feature_layers (`Union[int, List[int]]`): + The vision feature layer, or the list of indexes of the layers to select + the vision feature. Returns: image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). 
""" image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) - # For VIP-llava, the image features are computed this way - # We select the features from index 1: for the layers -2, -5, -8, -11 and 6 - image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers] - image_features = torch.cat(image_features, dim=-1) + # If multiple feature layers are provided (which is usually the case) + # then the image features are concatenated after the CLS is removed. + if isinstance(vision_feature_layers, int): + image_features = image_outputs.hidden_states[vision_feature_layers][:, 1:] + else: + # Usually, we select the features from index 1: the layers -2, -5, -8, -11 and 6 + image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers] + image_features = torch.cat(image_features, dim=-1) image_features = self.multi_modal_projector(image_features) return image_features @@ -386,7 +392,7 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin) position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layers: Optional[List[int]] = None, + vision_feature_layers: Optional[Union[int, List[int]]] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 2ad763a829..0b8ebb9a17 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -17,6 +17,7 @@ import unittest import requests +from parameterized import parameterized from transformers import ( AutoProcessor, @@ -272,6 +273,32 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM pixel_values = torch.cat([pixel_values, pixel_values], dim=0) _ = 
model(input_ids=input_ids, pixel_values=pixel_values) + @parameterized.expand( + [ + (-1,), + ([-1],), + ([-1, -2],), + ], + ) + def test_vision_feature_layers(self, vision_feature_layer): + """ + Test that we can use either one vision feature layer, or a list of + vision feature layers. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.vision_feature_layer = vision_feature_layer + + num_feature_layers = 1 if isinstance(vision_feature_layer, int) else len(vision_feature_layer) + hidden_size = config.vision_config.hidden_size + expected_features = hidden_size * num_feature_layers + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + # We should have the right number of input features, + # and should be able to run a forward pass without exploding + assert model.multi_modal_projector.linear_1.in_features == expected_features + model(**input_dict) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index e5a841cf47..c797a2b0c4 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -18,6 +18,7 @@ import unittest import requests from huggingface_hub import hf_hub_download +from parameterized import parameterized from transformers import ( AutoProcessor, @@ -321,6 +322,32 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes image_sizes = torch.cat([image_sizes, image_sizes], dim=0) _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + @parameterized.expand( + [ + (-1,), + ([-1],), + ([-1, -2],), + ], + ) + def test_vision_feature_layers(self, vision_feature_layer): + """ + Test that we can use either one vision feature layer, or a 
list of + vision feature layers. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.vision_feature_layer = vision_feature_layer + + num_feature_layers = 1 if isinstance(vision_feature_layer, int) else len(vision_feature_layer) + hidden_size = config.vision_config.hidden_size + expected_features = hidden_size * num_feature_layers + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + # We should have the right number of input features, + # and should be able to run a forward pass without exploding + assert model.multi_modal_projector.linear_1.in_features == expected_features + model(**input_dict) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) @@ -558,3 +585,25 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase): self.processor.decode(output[0], skip_special_tokens=True), EXPECTED_DECODED_TEXT, ) + + @unittest.skip(reason="Granite multimodal [vision] models are not yet released") + @slow + def test_granite_vision(self): + """ + Check the expected output of a granite vision model, which leverages + multiple vision feature layers and a visual encoder with no CLS (siglip). + """ + # TODO @alex-jw-brooks - update the path and enable this test once the 2b model is released + granite_model_path = "llava-granite-2b" + model = LlavaNextForConditionalGeneration.from_pretrained(granite_model_path) + self.processor = AutoProcessor.from_pretrained(granite_model_path) + prompt = "<|user|>\n\nWhat is shown in this image?\n<|assistant|>\n" + inputs = self.processor(prompt, self.image, return_tensors="pt").to(model.device) + + # verify generation + output = model.generate(**inputs, max_new_tokens=30) + EXPECTED_DECODED_TEXT = "<|user|>\n\nWhat is shown in this image?\n<|assistant|>\nThe image depicts a diagram." 
+ self.assertEqual( + self.processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index 7aabb854e7..576329fcfa 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -18,6 +18,7 @@ import unittest import numpy as np from huggingface_hub import hf_hub_download +from parameterized import parameterized from transformers import ( AutoProcessor, @@ -338,6 +339,32 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati image_sizes = torch.cat([image_sizes, image_sizes], dim=0) _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + @parameterized.expand( + [ + (-1,), + ([-1],), + ([-1, -2],), + ], + ) + def test_vision_feature_layers(self, vision_feature_layer): + """ + Test that we can use either one vision feature layer, or a list of + vision feature layers. 
+ """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.vision_feature_layer = vision_feature_layer + + num_feature_layers = 1 if isinstance(vision_feature_layer, int) else len(vision_feature_layer) + hidden_size = config.vision_config.hidden_size + expected_features = hidden_size * num_feature_layers + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + # We should have the right number of input features, + # and should be able to run a forward pass without exploding + assert model.multi_modal_projector.linear_1.in_features == expected_features + model(**input_dict) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py index 69548213b9..45cb433d86 100644 --- a/tests/models/llava_onevision/test_modeling_llava_onevision.py +++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py @@ -19,6 +19,7 @@ import unittest import numpy as np import requests from huggingface_hub import hf_hub_download +from parameterized import parameterized from transformers import ( AutoProcessor, @@ -292,6 +293,32 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + @parameterized.expand( + [ + (-1,), + ([-1],), + ([-1, -2],), + ], + ) + def test_vision_feature_layers(self, vision_feature_layer): + """ + Test that we can use either one vision feature layer, or a list of + vision feature layers. 
+ """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.vision_feature_layer = vision_feature_layer + + num_feature_layers = 1 if isinstance(vision_feature_layer, int) else len(vision_feature_layer) + hidden_size = config.vision_config.hidden_size + expected_features = hidden_size * num_feature_layers + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + # We should have the right number of input features, + # and should be able to run a forward pass without exploding + assert model.multi_modal_projector.linear_1.in_features == expected_features + model(**input_dict) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, SiglipVisionModel does not support standalone training" ) diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index 1112988c9e..fa7800bdc4 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -19,6 +19,7 @@ import unittest import numpy as np import requests from huggingface_hub import hf_hub_download +from parameterized import parameterized from transformers import ( VideoLlavaConfig, @@ -419,6 +420,32 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe pixel_values = torch.cat([pixel_values, pixel_values], dim=0) _ = model(input_ids=input_ids, pixel_values_images=pixel_values) + @parameterized.expand( + [ + (-1,), + ([-1],), + ([-1, -2],), + ], + ) + def test_vision_feature_layers(self, vision_feature_layer): + """ + Test that we can use either one vision feature layer, or a list of + vision feature layers. 
+ """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.vision_feature_layer = vision_feature_layer + + num_feature_layers = 1 if isinstance(vision_feature_layer, int) else len(vision_feature_layer) + hidden_size = config.vision_config.hidden_size + expected_features = hidden_size * num_feature_layers + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + # We should have the right number of input features, + # and should be able to run a forward pass without exploding + assert model.multi_modal_projector.linear_1.in_features == expected_features + model(**input_dict) + @require_torch class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index aeb082711f..3cbac0ddef 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -17,6 +17,7 @@ import unittest import requests +from parameterized import parameterized from transformers import ( AutoProcessor, @@ -257,6 +258,37 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest pixel_values = torch.cat([pixel_values, pixel_values], dim=0) _ = model(input_ids=input_ids, pixel_values=pixel_values) + @parameterized.expand( + [ + (-1,), + ([-1],), + ([-1, -2],), + ], + ) + def test_vision_feature_layers(self, vision_feature_layers): + """ + Test that we can use either one vision feature layer, or a list of + vision feature layers. + """ + # NOTE: vipllava uses vision_feature_layers instead of vision_feature_layer as the + # config key. The reason is that other llava classes supported one vision feature layer + # and added support for a list of layers with granite vision support, while vipllava + # originally supported multiple feature layers, and added support for a single layer + # for compatibility reasons. 
+ config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.vision_feature_layers = vision_feature_layers + + num_feature_layers = 1 if isinstance(vision_feature_layers, int) else len(vision_feature_layers) + hidden_size = config.vision_config.hidden_size + expected_features = hidden_size * num_feature_layers + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + # We should have the right number of input features, + # and should be able to run a forward pass without exploding + assert model.multi_modal_projector.linear_1.in_features == expected_features + model(**input_dict) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" )