Return image hidden states (#33426)
* fix * return image hidden states * fix copies * fix test
This commit is contained in:
committed by
GitHub
parent
a05ce550bf
commit
9c4639b622
@@ -43,7 +43,6 @@ _CHECKPOINT_FOR_DOC = "llava-hf/llava-1.5-7b-hf"
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Llava
|
|
||||||
class LlavaCausalLMOutputWithPast(ModelOutput):
|
class LlavaCausalLMOutputWithPast(ModelOutput):
|
||||||
"""
|
"""
|
||||||
Base class for Llava causal language model (or autoregressive) outputs.
|
Base class for Llava causal language model (or autoregressive) outputs.
|
||||||
@@ -70,11 +69,9 @@ class LlavaCausalLMOutputWithPast(ModelOutput):
|
|||||||
|
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||||
heads.
|
heads.
|
||||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||||
sequence_length, hidden_size)`.
|
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||||
|
|
||||||
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
loss: Optional[torch.FloatTensor] = None
|
loss: Optional[torch.FloatTensor] = None
|
||||||
@@ -82,7 +79,7 @@ class LlavaCausalLMOutputWithPast(ModelOutput):
|
|||||||
past_key_values: Optional[List[torch.FloatTensor]] = None
|
past_key_values: Optional[List[torch.FloatTensor]] = None
|
||||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||||
|
|
||||||
|
|
||||||
class LlavaMultiModalProjector(nn.Module):
|
class LlavaMultiModalProjector(nn.Module):
|
||||||
@@ -560,6 +557,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
|
|||||||
past_key_values=outputs.past_key_values,
|
past_key_values=outputs.past_key_values,
|
||||||
hidden_states=outputs.hidden_states,
|
hidden_states=outputs.hidden_states,
|
||||||
attentions=outputs.attentions,
|
attentions=outputs.attentions,
|
||||||
|
image_hidden_states=image_features if pixel_values is not None else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_inputs_for_generation(
|
def prepare_inputs_for_generation(
|
||||||
|
|||||||
@@ -144,7 +144,6 @@ def unpad_image(tensor, original_size):
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->LlavaNext
|
|
||||||
class LlavaNextCausalLMOutputWithPast(ModelOutput):
|
class LlavaNextCausalLMOutputWithPast(ModelOutput):
|
||||||
"""
|
"""
|
||||||
Base class for LlavaNext causal language model (or autoregressive) outputs.
|
Base class for LlavaNext causal language model (or autoregressive) outputs.
|
||||||
@@ -171,11 +170,9 @@ class LlavaNextCausalLMOutputWithPast(ModelOutput):
|
|||||||
|
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||||
heads.
|
heads.
|
||||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
|
||||||
sequence_length, hidden_size)`.
|
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||||
|
|
||||||
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
loss: Optional[torch.FloatTensor] = None
|
loss: Optional[torch.FloatTensor] = None
|
||||||
@@ -183,7 +180,7 @@ class LlavaNextCausalLMOutputWithPast(ModelOutput):
|
|||||||
past_key_values: Optional[List[torch.FloatTensor]] = None
|
past_key_values: Optional[List[torch.FloatTensor]] = None
|
||||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext
|
# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext
|
||||||
@@ -931,6 +928,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
|
|||||||
past_key_values=outputs.past_key_values,
|
past_key_values=outputs.past_key_values,
|
||||||
hidden_states=outputs.hidden_states,
|
hidden_states=outputs.hidden_states,
|
||||||
attentions=outputs.attentions,
|
attentions=outputs.attentions,
|
||||||
|
image_hidden_states=image_features if pixel_values is not None else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_inputs_for_generation(
|
def prepare_inputs_for_generation(
|
||||||
|
|||||||
@@ -150,7 +150,6 @@ def unpad_image(tensor, original_size):
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->LlavaNextVideo
|
|
||||||
class LlavaNextVideoCausalLMOutputWithPast(ModelOutput):
|
class LlavaNextVideoCausalLMOutputWithPast(ModelOutput):
|
||||||
"""
|
"""
|
||||||
Base class for LlavaNextVideo causal language model (or autoregressive) outputs.
|
Base class for LlavaNextVideo causal language model (or autoregressive) outputs.
|
||||||
@@ -177,11 +176,12 @@ class LlavaNextVideoCausalLMOutputWithPast(ModelOutput):
|
|||||||
|
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||||
heads.
|
heads.
|
||||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
|
||||||
sequence_length, hidden_size)`.
|
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||||
|
video_hidden_states (`torch.FloatTensor`, *optional*):
|
||||||
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
|
A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
|
||||||
|
video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
loss: Optional[torch.FloatTensor] = None
|
loss: Optional[torch.FloatTensor] = None
|
||||||
@@ -189,7 +189,8 @@ class LlavaNextVideoCausalLMOutputWithPast(ModelOutput):
|
|||||||
past_key_values: Optional[List[torch.FloatTensor]] = None
|
past_key_values: Optional[List[torch.FloatTensor]] = None
|
||||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||||
|
video_hidden_states: Optional[torch.FloatTensor] = None
|
||||||
|
|
||||||
|
|
||||||
class LlavaNextVideoPooler(nn.Module):
|
class LlavaNextVideoPooler(nn.Module):
|
||||||
@@ -1015,6 +1016,8 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
|
|||||||
past_key_values=outputs.past_key_values,
|
past_key_values=outputs.past_key_values,
|
||||||
hidden_states=outputs.hidden_states,
|
hidden_states=outputs.hidden_states,
|
||||||
attentions=outputs.attentions,
|
attentions=outputs.attentions,
|
||||||
|
image_hidden_states=image_features if pixel_values is not None else None,
|
||||||
|
video_hidden_states=video_features if pixel_values_videos is not None else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_inputs_for_generation(
|
def prepare_inputs_for_generation(
|
||||||
|
|||||||
@@ -145,7 +145,7 @@ def unpad_image(tensor, original_size):
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->LlavaOnevision
|
# Copied from transformers.models.llava_next_video.modeling_llava_next_video.LlavaNextVideoCausalLMOutputWithPast with LlavaNextVideo->LlavaOnevision
|
||||||
class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
|
class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
|
||||||
"""
|
"""
|
||||||
Base class for LlavaOnevision causal language model (or autoregressive) outputs.
|
Base class for LlavaOnevision causal language model (or autoregressive) outputs.
|
||||||
@@ -172,11 +172,12 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
|
|||||||
|
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||||
heads.
|
heads.
|
||||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
|
||||||
sequence_length, hidden_size)`.
|
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||||
|
video_hidden_states (`torch.FloatTensor`, *optional*):
|
||||||
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
|
A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
|
||||||
|
video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
loss: Optional[torch.FloatTensor] = None
|
loss: Optional[torch.FloatTensor] = None
|
||||||
@@ -184,7 +185,8 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
|
|||||||
past_key_values: Optional[List[torch.FloatTensor]] = None
|
past_key_values: Optional[List[torch.FloatTensor]] = None
|
||||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||||
|
video_hidden_states: Optional[torch.FloatTensor] = None
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaOnevision
|
# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaOnevision
|
||||||
@@ -690,6 +692,8 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel):
|
|||||||
past_key_values=outputs.past_key_values,
|
past_key_values=outputs.past_key_values,
|
||||||
hidden_states=outputs.hidden_states,
|
hidden_states=outputs.hidden_states,
|
||||||
attentions=outputs.attentions,
|
attentions=outputs.attentions,
|
||||||
|
image_hidden_states=image_features if pixel_values is not None else None,
|
||||||
|
video_hidden_states=video_features if pixel_values_videos is not None else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_inputs_for_generation(
|
def prepare_inputs_for_generation(
|
||||||
|
|||||||
@@ -72,11 +72,9 @@ class PaliGemmaCausalLMOutputWithPast(ModelOutput):
|
|||||||
|
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||||
heads.
|
heads.
|
||||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||||
sequence_length, hidden_size)`.
|
image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
|
||||||
|
|
||||||
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
loss: Optional[torch.FloatTensor] = None
|
loss: Optional[torch.FloatTensor] = None
|
||||||
@@ -84,7 +82,7 @@ class PaliGemmaCausalLMOutputWithPast(ModelOutput):
|
|||||||
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None
|
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None
|
||||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||||
|
|
||||||
|
|
||||||
class PaliGemmaMultiModalProjector(nn.Module):
|
class PaliGemmaMultiModalProjector(nn.Module):
|
||||||
@@ -488,6 +486,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel):
|
|||||||
past_key_values=outputs.past_key_values,
|
past_key_values=outputs.past_key_values,
|
||||||
hidden_states=outputs.hidden_states,
|
hidden_states=outputs.hidden_states,
|
||||||
attentions=outputs.attentions,
|
attentions=outputs.attentions,
|
||||||
|
image_hidden_states=image_features if pixel_values is not None else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_inputs_for_generation(
|
def prepare_inputs_for_generation(
|
||||||
|
|||||||
@@ -40,7 +40,6 @@ _CONFIG_FOR_DOC = "VideoLlavaConfig"
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->VideoLlava
|
|
||||||
class VideoLlavaCausalLMOutputWithPast(ModelOutput):
|
class VideoLlavaCausalLMOutputWithPast(ModelOutput):
|
||||||
"""
|
"""
|
||||||
Base class for VideoLlava causal language model (or autoregressive) outputs.
|
Base class for VideoLlava causal language model (or autoregressive) outputs.
|
||||||
@@ -67,11 +66,12 @@ class VideoLlavaCausalLMOutputWithPast(ModelOutput):
|
|||||||
|
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||||
heads.
|
heads.
|
||||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||||
sequence_length, hidden_size)`.
|
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||||
|
video_hidden_states (`torch.FloatTensor`, *optional*):
|
||||||
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
|
A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
|
||||||
|
video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
loss: Optional[torch.FloatTensor] = None
|
loss: Optional[torch.FloatTensor] = None
|
||||||
@@ -79,7 +79,8 @@ class VideoLlavaCausalLMOutputWithPast(ModelOutput):
|
|||||||
past_key_values: Optional[List[torch.FloatTensor]] = None
|
past_key_values: Optional[List[torch.FloatTensor]] = None
|
||||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||||
|
video_hidden_states: Optional[torch.FloatTensor] = None
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->VideoLlava
|
# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->VideoLlava
|
||||||
@@ -672,6 +673,8 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel):
|
|||||||
past_key_values=outputs.past_key_values,
|
past_key_values=outputs.past_key_values,
|
||||||
hidden_states=outputs.hidden_states,
|
hidden_states=outputs.hidden_states,
|
||||||
attentions=outputs.attentions,
|
attentions=outputs.attentions,
|
||||||
|
image_hidden_states=image_features if pixel_values_images is not None else None,
|
||||||
|
video_hidden_states=video_features if pixel_values_videos is not None else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_inputs_for_generation(
|
def prepare_inputs_for_generation(
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ _CONFIG_FOR_DOC = "VipLlavaConfig"
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->VipLlava
|
# Copied from transformers.models.llava.modeling_llava.LlavaCausalLMOutputWithPast with Llava->VipLlava
|
||||||
class VipLlavaCausalLMOutputWithPast(ModelOutput):
|
class VipLlavaCausalLMOutputWithPast(ModelOutput):
|
||||||
"""
|
"""
|
||||||
Base class for VipLlava causal language model (or autoregressive) outputs.
|
Base class for VipLlava causal language model (or autoregressive) outputs.
|
||||||
@@ -67,11 +67,9 @@ class VipLlavaCausalLMOutputWithPast(ModelOutput):
|
|||||||
|
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||||
heads.
|
heads.
|
||||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||||
sequence_length, hidden_size)`.
|
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||||
|
|
||||||
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
loss: Optional[torch.FloatTensor] = None
|
loss: Optional[torch.FloatTensor] = None
|
||||||
@@ -79,7 +77,7 @@ class VipLlavaCausalLMOutputWithPast(ModelOutput):
|
|||||||
past_key_values: Optional[List[torch.FloatTensor]] = None
|
past_key_values: Optional[List[torch.FloatTensor]] = None
|
||||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||||
image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||||
|
|
||||||
|
|
||||||
class VipLlavaMultiModalProjector(nn.Module):
|
class VipLlavaMultiModalProjector(nn.Module):
|
||||||
@@ -554,6 +552,7 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel):
|
|||||||
past_key_values=outputs.past_key_values,
|
past_key_values=outputs.past_key_values,
|
||||||
hidden_states=outputs.hidden_states,
|
hidden_states=outputs.hidden_states,
|
||||||
attentions=outputs.attentions,
|
attentions=outputs.attentions,
|
||||||
|
image_hidden_states=image_features if pixel_values is not None else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def prepare_inputs_for_generation(
|
def prepare_inputs_for_generation(
|
||||||
|
|||||||
@@ -320,6 +320,10 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
|||||||
model_row_output = model(**single_row_input)
|
model_row_output = model(**single_row_input)
|
||||||
|
|
||||||
for key in model_batched_output:
|
for key in model_batched_output:
|
||||||
|
# we can't test videos as their output shapes are linked to number of frames
|
||||||
|
# and we don't have to as it is a CLIP model and can be tested from `ClipModelTester` class
|
||||||
|
if key == "video_hidden_states":
|
||||||
|
continue
|
||||||
recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
|
recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
|
||||||
|
|
||||||
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
||||||
|
|||||||
Reference in New Issue
Block a user