From 9c4639b62236b503d881c225246ad8ee61ae6ed8 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Fri, 13 Sep 2024 10:20:03 +0200 Subject: [PATCH] Return image hidden states (#33426) * fix * return image hidden states * fix copies * fix test --- .../models/llava/modeling_llava.py | 12 +++++------- .../models/llava_next/modeling_llava_next.py | 12 +++++------- .../modeling_llava_next_video.py | 17 ++++++++++------- .../modeling_llava_onevision.py | 18 +++++++++++------- .../models/paligemma/modeling_paligemma.py | 11 +++++------ .../models/video_llava/modeling_video_llava.py | 17 ++++++++++------- .../models/vipllava/modeling_vipllava.py | 13 ++++++------- .../video_llava/test_modeling_video_llava.py | 4 ++++ 8 files changed, 56 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 94388af99e..9ad19ccee7 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -43,7 +43,6 @@ _CHECKPOINT_FOR_DOC = "llava-hf/llava-1.5-7b-hf" @dataclass -# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Llava class LlavaCausalLMOutputWithPast(ModelOutput): """ Base class for Llava causal language model (or autoregressive) outputs. @@ -70,11 +69,9 @@ class LlavaCausalLMOutputWithPast(ModelOutput): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): - Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, - sequence_length, hidden_size)`. - - image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`. 
+ image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. """ loss: Optional[torch.FloatTensor] = None @@ -82,7 +79,7 @@ class LlavaCausalLMOutputWithPast(ModelOutput): past_key_values: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None class LlavaMultiModalProjector(nn.Module): @@ -560,6 +557,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel): past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, ) def prepare_inputs_for_generation( diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 18a17c6dcd..ebb4da3102 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -144,7 +144,6 @@ def unpad_image(tensor, original_size): @dataclass -# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->LlavaNext class LlavaNextCausalLMOutputWithPast(ModelOutput): """ Base class for LlavaNext causal language model (or autoregressive) outputs. @@ -171,11 +170,9 @@ class LlavaNextCausalLMOutputWithPast(ModelOutput): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): - Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, - sequence_length, hidden_size)`. 
- - image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. """ loss: Optional[torch.FloatTensor] = None @@ -183,7 +180,7 @@ class LlavaNextCausalLMOutputWithPast(ModelOutput): past_key_values: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None # Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext @@ -931,6 +928,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel): past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, ) def prepare_inputs_for_generation( diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 7d6776738c..589bf346ce 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -150,7 +150,6 @@ def unpad_image(tensor, original_size): @dataclass -# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->LlavaNextVideo class LlavaNextVideoCausalLMOutputWithPast(ModelOutput): """ Base class for LlavaNextVideo causal language model (or autoregressive) outputs.
@@ -177,11 +176,12 @@ class LlavaNextVideoCausalLMOutputWithPast(ModelOutput): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): - Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, - sequence_length, hidden_size)`. - - image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + video_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`. + video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. 
""" loss: Optional[torch.FloatTensor] = None @@ -189,7 +189,8 @@ class LlavaNextVideoCausalLMOutputWithPast(ModelOutput): past_key_values: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + video_hidden_states: Optional[torch.FloatTensor] = None class LlavaNextVideoPooler(nn.Module): @@ -1015,6 +1016,8 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel): past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + video_hidden_states=video_features if pixel_values_videos is not None else None, ) def prepare_inputs_for_generation( diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 9496c38570..697ea84fea 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -145,7 +145,7 @@ def unpad_image(tensor, original_size): @dataclass -# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->LlavaOnevision +# Copied from transformers.models.llava_next_video.modeling_llava_next_video.LlavaNextVideoCausalLMOutputWithPast with LlavaNextVideo->LlavaOnevision class LlavaOnevisionCausalLMOutputWithPast(ModelOutput): """ Base class for LlavaOnevision causal language model (or autoregressive) outputs. @@ -172,11 +172,12 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
- image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): - Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, - sequence_length, hidden_size)`. - - image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + video_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`. + video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. """ loss: Optional[torch.FloatTensor] = None @@ -184,7 +185,8 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput): past_key_values: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + video_hidden_states: Optional[torch.FloatTensor] = None # Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaOnevision @@ -690,6 +692,8 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel): past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + video_hidden_states=video_features if pixel_values_videos is not None else None, ) def prepare_inputs_for_generation( diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index 71bc7c3313..39ee57d70b 100644 --- 
a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -72,11 +72,9 @@ class PaliGemmaCausalLMOutputWithPast(ModelOutput): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): - Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, - sequence_length, hidden_size)`. - - image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder after projecting last hidden state. """ loss: Optional[torch.FloatTensor] = None @@ -84,7 +82,7 @@ class PaliGemmaCausalLMOutputWithPast(ModelOutput): past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None class PaliGemmaMultiModalProjector(nn.Module): @@ -488,6 +486,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel): past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, ) def prepare_inputs_for_generation( diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 08e02d9a70..9ae80be65a 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -40,7 +40,6 @@ _CONFIG_FOR_DOC = "VideoLlavaConfig" 
@dataclass -# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->VideoLlava class VideoLlavaCausalLMOutputWithPast(ModelOutput): """ Base class for VideoLlava causal language model (or autoregressive) outputs. @@ -67,11 +66,12 @@ class VideoLlavaCausalLMOutputWithPast(ModelOutput): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): - Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, - sequence_length, hidden_size)`. - - image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + video_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`. + video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
""" loss: Optional[torch.FloatTensor] = None @@ -79,7 +79,8 @@ class VideoLlavaCausalLMOutputWithPast(ModelOutput): past_key_values: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + video_hidden_states: Optional[torch.FloatTensor] = None # Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->VideoLlava @@ -672,6 +673,8 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel): past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values_images is not None else None, + video_hidden_states=video_features if pixel_values_videos is not None else None, ) def prepare_inputs_for_generation( diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 5367b1e088..53a3213697 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -40,7 +40,7 @@ _CONFIG_FOR_DOC = "VipLlavaConfig" @dataclass -# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->VipLlava +# Copied from transformers.models.llava.modeling_llava.LlavaCausalLMOutputWithPast with Llava->VipLlava class VipLlavaCausalLMOutputWithPast(ModelOutput): """ Base class for VipLlava causal language model (or autoregressive) outputs. @@ -67,11 +67,9 @@ class VipLlavaCausalLMOutputWithPast(ModelOutput): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
- image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): - Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, - sequence_length, hidden_size)`. - - image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. """ loss: Optional[torch.FloatTensor] = None @@ -79,7 +77,7 @@ class VipLlavaCausalLMOutputWithPast(ModelOutput): past_key_values: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None class VipLlavaMultiModalProjector(nn.Module): @@ -554,6 +552,7 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel): past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, ) def prepare_inputs_for_generation( diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index a8b2229a02..1f88834537 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -320,6 +320,10 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe model_row_output = model(**single_row_input) for key in model_batched_output: + # we can't test videos as their output shapes are linked to number of frames + # and we don't have to as it is a CLIP model and can be tested from `ClipModelTester` class + if key == "video_hidden_states": + continue 
recursive_check(model_batched_output[key], model_row_output[key], model_name, key) # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs