committed by
GitHub
parent
7176e06b52
commit
d1681ec2b6
@@ -461,18 +461,9 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
|
|||||||
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
|
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
|
||||||
)
|
)
|
||||||
|
|
||||||
legacy_processing = False
|
|
||||||
if inputs_embeds is None:
|
if inputs_embeds is None:
|
||||||
inputs_embeds = self.get_input_embeddings()(input_ids)
|
inputs_embeds = self.get_input_embeddings()(input_ids)
|
||||||
|
|
||||||
# if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
|
|
||||||
# not very reliable, but we don't expect one to actually pass 500+ images for one prompt
|
|
||||||
# In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
|
|
||||||
legacy_processing = (
|
|
||||||
(input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
|
|
||||||
) or (input_ids.shape[-1] == 1 and pixel_values is not None)
|
|
||||||
|
|
||||||
image_features = None
|
|
||||||
if pixel_values is not None:
|
if pixel_values is not None:
|
||||||
image_features = self.get_image_features(
|
image_features = self.get_image_features(
|
||||||
pixel_values=pixel_values,
|
pixel_values=pixel_values,
|
||||||
@@ -480,66 +471,14 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
|
|||||||
vision_feature_select_strategy=vision_feature_select_strategy,
|
vision_feature_select_strategy=vision_feature_select_strategy,
|
||||||
)
|
)
|
||||||
|
|
||||||
if legacy_processing:
|
|
||||||
logger.warning_once(
|
|
||||||
"Expanding inputs for image tokens in LLaVa should be done in processing. "
|
|
||||||
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
|
|
||||||
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
|
|
||||||
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
|
|
||||||
)
|
|
||||||
# prefill stage vs decoding stage (legacy behavior copied)
|
|
||||||
if input_ids.shape[1] != 1:
|
|
||||||
inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
|
|
||||||
image_features, inputs_embeds, input_ids, attention_mask, labels
|
|
||||||
)
|
|
||||||
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
|
|
||||||
else:
|
|
||||||
# Retrieve the first layer to inspect the logits and mask out the hidden states
|
|
||||||
# that are set to 0
|
|
||||||
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
|
|
||||||
|
|
||||||
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
|
|
||||||
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
|
|
||||||
|
|
||||||
# Get the target length
|
|
||||||
target_length = input_ids.shape[1]
|
|
||||||
past_length = first_layer_past_key_value.shape[-1]
|
|
||||||
|
|
||||||
extended_attention_mask = torch.ones(
|
|
||||||
(attention_mask.shape[0], past_length),
|
|
||||||
dtype=attention_mask.dtype,
|
|
||||||
device=attention_mask.device,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Filter out only the tokens that can be un-attended, this can happen
|
|
||||||
# if one uses Llava + Fused modules where the cache on the
|
|
||||||
# first iteration is already big enough, or if one passes custom cache
|
|
||||||
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
|
|
||||||
new_batch_index = batch_index[valid_indices]
|
|
||||||
new_non_attended_tokens = non_attended_tokens[valid_indices]
|
|
||||||
|
|
||||||
# Zero-out the places where we don't need to attend
|
|
||||||
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
|
|
||||||
|
|
||||||
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
|
|
||||||
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
|
|
||||||
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
|
|
||||||
|
|
||||||
# TODO: @raushan retain only the new behavior after v4.47
|
|
||||||
elif image_features is not None:
|
|
||||||
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
||||||
n_image_features = image_features.shape[0] * image_features.shape[1]
|
n_image_features = image_features.shape[0] * image_features.shape[1]
|
||||||
|
|
||||||
if n_image_tokens != n_image_features:
|
if n_image_tokens != n_image_features:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
||||||
)
|
)
|
||||||
special_image_mask = (
|
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
|
||||||
(input_ids == self.config.image_token_index)
|
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||||
.unsqueeze(-1)
|
|
||||||
.expand_as(inputs_embeds)
|
|
||||||
.to(inputs_embeds.device)
|
|
||||||
)
|
|
||||||
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
||||||
|
|
||||||
|
|||||||
@@ -154,27 +154,17 @@ class LlavaProcessor(ProcessorMixin):
|
|||||||
# try to expand inputs in processing if we have the necessary parts
|
# try to expand inputs in processing if we have the necessary parts
|
||||||
prompt_strings = text
|
prompt_strings = text
|
||||||
if image_inputs.get("pixel_values") is not None:
|
if image_inputs.get("pixel_values") is not None:
|
||||||
if self.patch_size is not None and self.vision_feature_select_strategy is not None:
|
# Replace the image token with the expanded image token sequence
|
||||||
# Replace the image token with the expanded image token sequence
|
pixel_values = image_inputs["pixel_values"]
|
||||||
pixel_values = image_inputs["pixel_values"]
|
height, width = get_image_size(to_numpy_array(pixel_values[0]))
|
||||||
height, width = get_image_size(to_numpy_array(pixel_values[0]))
|
num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1
|
||||||
num_image_tokens = (height // self.patch_size) * (
|
if self.vision_feature_select_strategy == "default":
|
||||||
width // self.patch_size
|
num_image_tokens -= 1
|
||||||
) + self.num_additional_image_tokens
|
|
||||||
if self.vision_feature_select_strategy == "default":
|
|
||||||
num_image_tokens -= self.num_additional_image_tokens
|
|
||||||
|
|
||||||
prompt_strings = []
|
prompt_strings = []
|
||||||
for sample in text:
|
for sample in text:
|
||||||
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
|
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
|
||||||
prompt_strings.append(sample)
|
prompt_strings.append(sample)
|
||||||
else:
|
|
||||||
logger.warning_once(
|
|
||||||
"Expanding inputs for image tokens in LLaVa should be done in processing. "
|
|
||||||
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
|
|
||||||
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
|
|
||||||
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
|
|
||||||
)
|
|
||||||
|
|
||||||
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
||||||
return BatchFeature(data={**text_inputs, **image_inputs})
|
return BatchFeature(data={**text_inputs, **image_inputs})
|
||||||
|
|||||||
@@ -689,7 +689,9 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
|
|||||||
image_feature = torch.cat(
|
image_feature = torch.cat(
|
||||||
(
|
(
|
||||||
image_feature,
|
image_feature,
|
||||||
image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.dtype),
|
image_newline[:, None, None]
|
||||||
|
.expand(*image_feature.shape[:-1], 1)
|
||||||
|
.to(image_feature.device, image_feature.dtype),
|
||||||
),
|
),
|
||||||
dim=-1,
|
dim=-1,
|
||||||
)
|
)
|
||||||
@@ -835,18 +837,9 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
|
|||||||
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
|
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
|
||||||
)
|
)
|
||||||
|
|
||||||
legacy_processing = False
|
|
||||||
if inputs_embeds is None:
|
if inputs_embeds is None:
|
||||||
inputs_embeds = self.get_input_embeddings()(input_ids)
|
inputs_embeds = self.get_input_embeddings()(input_ids)
|
||||||
|
|
||||||
# if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
|
|
||||||
# not very reliable, but we don't expect one to actually pass 500+ images for one prompt
|
|
||||||
# In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
|
|
||||||
legacy_processing = (
|
|
||||||
(input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
|
|
||||||
) or (input_ids.shape[-1] == 1 and pixel_values is not None)
|
|
||||||
|
|
||||||
image_features = None
|
|
||||||
if pixel_values is not None and pixel_values.size(0) > 0:
|
if pixel_values is not None and pixel_values.size(0) > 0:
|
||||||
image_features = self.get_image_features(
|
image_features = self.get_image_features(
|
||||||
pixel_values,
|
pixel_values,
|
||||||
@@ -863,70 +856,14 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
|
|||||||
image_newline=self.image_newline,
|
image_newline=self.image_newline,
|
||||||
)
|
)
|
||||||
|
|
||||||
if legacy_processing:
|
|
||||||
logger.warning_once(
|
|
||||||
"Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
|
|
||||||
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
|
|
||||||
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
|
|
||||||
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
|
|
||||||
)
|
|
||||||
if input_ids.shape[1] != 1:
|
|
||||||
inputs_embeds = inputs_embeds.to(image_features.dtype)
|
|
||||||
inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features(
|
|
||||||
image_features,
|
|
||||||
feature_lens,
|
|
||||||
inputs_embeds,
|
|
||||||
input_ids,
|
|
||||||
attention_mask,
|
|
||||||
position_ids,
|
|
||||||
labels=labels,
|
|
||||||
)
|
|
||||||
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
|
|
||||||
else:
|
|
||||||
# Retrieve the first layer to inspect the logits and mask out the hidden states
|
|
||||||
# that are set to 0
|
|
||||||
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
|
|
||||||
|
|
||||||
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
|
|
||||||
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
|
|
||||||
|
|
||||||
# Get the target length
|
|
||||||
target_length = input_ids.shape[1]
|
|
||||||
past_length = first_layer_past_key_value.shape[-1]
|
|
||||||
|
|
||||||
extended_attention_mask = torch.ones(
|
|
||||||
(attention_mask.shape[0], past_length),
|
|
||||||
dtype=attention_mask.dtype,
|
|
||||||
device=attention_mask.device,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Filter out only the tokens that can be un-attended, this can happen
|
|
||||||
# if one uses Llava + Fused modules where the cache on the
|
|
||||||
# first iteration is already big enough, or if one passes custom cache
|
|
||||||
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
|
|
||||||
new_batch_index = batch_index[valid_indices]
|
|
||||||
new_non_attended_tokens = non_attended_tokens[valid_indices]
|
|
||||||
|
|
||||||
# Zero-out the places where we don't need to attend
|
|
||||||
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
|
|
||||||
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
|
|
||||||
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
|
|
||||||
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
|
|
||||||
|
|
||||||
# TODO: @raushan retain only the new behavior after v4.47
|
|
||||||
elif image_features is not None:
|
|
||||||
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
||||||
n_image_features = image_features.shape[0]
|
n_image_features = image_features.shape[0]
|
||||||
if n_image_tokens != n_image_features:
|
if n_image_tokens != n_image_features:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
||||||
)
|
)
|
||||||
special_image_mask = (
|
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
|
||||||
(input_ids == self.config.image_token_index)
|
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||||
.unsqueeze(-1)
|
|
||||||
.expand_as(inputs_embeds)
|
|
||||||
.to(inputs_embeds.device)
|
|
||||||
)
|
|
||||||
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
||||||
|
|
||||||
|
|||||||
@@ -149,30 +149,19 @@ class LlavaNextProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
prompt_strings = text
|
prompt_strings = text
|
||||||
if image_inputs:
|
if image_inputs:
|
||||||
if self.patch_size is None or self.vision_feature_select_strategy is None:
|
image_sizes = iter(image_inputs["image_sizes"])
|
||||||
logger.warning_once(
|
height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
|
||||||
"Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
|
prompt_strings = []
|
||||||
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
|
for sample in text:
|
||||||
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
|
while self.image_token in sample:
|
||||||
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
|
image_size = next(image_sizes)
|
||||||
)
|
orig_height, orig_width = image_size
|
||||||
else:
|
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
|
||||||
image_sizes = iter(image_inputs["image_sizes"])
|
if self.vision_feature_select_strategy == "default":
|
||||||
height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
|
num_image_tokens -= 1
|
||||||
prompt_strings = []
|
sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
|
||||||
for sample in text:
|
prompt_strings.append(sample)
|
||||||
while self.image_token in sample:
|
prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
|
||||||
image_size = next(image_sizes)
|
|
||||||
if not isinstance(image_size, (list, tuple)):
|
|
||||||
# cast to list to avoid numerical precision errors when calculating unpadding
|
|
||||||
image_size = image_size.tolist()
|
|
||||||
orig_height, orig_width = image_size
|
|
||||||
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
|
|
||||||
if self.vision_feature_select_strategy == "default":
|
|
||||||
num_image_tokens -= self.num_additional_image_tokens
|
|
||||||
sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
|
|
||||||
prompt_strings.append(sample)
|
|
||||||
prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
|
|
||||||
|
|
||||||
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
||||||
|
|
||||||
|
|||||||
@@ -722,7 +722,9 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene
|
|||||||
image_feature = torch.cat(
|
image_feature = torch.cat(
|
||||||
(
|
(
|
||||||
image_feature,
|
image_feature,
|
||||||
image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.dtype),
|
image_newline[:, None, None]
|
||||||
|
.expand(*image_feature.shape[:-1], 1)
|
||||||
|
.to(image_feature.device, image_feature.dtype),
|
||||||
),
|
),
|
||||||
dim=-1,
|
dim=-1,
|
||||||
)
|
)
|
||||||
@@ -909,25 +911,9 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene
|
|||||||
"and must specify either one"
|
"and must specify either one"
|
||||||
)
|
)
|
||||||
|
|
||||||
legacy_processing = False
|
|
||||||
if inputs_embeds is None:
|
if inputs_embeds is None:
|
||||||
inputs_embeds = self.get_input_embeddings()(input_ids)
|
inputs_embeds = self.get_input_embeddings()(input_ids)
|
||||||
|
|
||||||
# if the number of image/video tokens is more than image embeddings seq length, then prob we expanded it in processing
|
|
||||||
# not very reliable, but we don't expect one to actually pass 500+ images for one prompt
|
|
||||||
img_token_not_enough = (input_ids == self.config.image_token_index).sum(
|
|
||||||
1
|
|
||||||
).max() < self.config.image_seq_length
|
|
||||||
video_token_not_enough = (input_ids == self.config.video_token_index).sum(
|
|
||||||
1
|
|
||||||
).max() < self.config.video_seq_length
|
|
||||||
inputs_not_expanded = (img_token_not_enough and pixel_values is not None) or (
|
|
||||||
video_token_not_enough and pixel_values_videos is not None
|
|
||||||
)
|
|
||||||
pixels_present = input_ids.shape[-1] == 1 and (pixel_values is not None or pixel_values_videos is not None)
|
|
||||||
legacy_processing = inputs_not_expanded or pixels_present
|
|
||||||
|
|
||||||
image_features = feature_lens = None
|
|
||||||
if pixel_values is not None and pixel_values.size(0) > 0:
|
if pixel_values is not None and pixel_values.size(0) > 0:
|
||||||
image_features = self.get_image_features(
|
image_features = self.get_image_features(
|
||||||
pixel_values,
|
pixel_values,
|
||||||
@@ -942,7 +928,17 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene
|
|||||||
image_newline=self.image_newline,
|
image_newline=self.image_newline,
|
||||||
)
|
)
|
||||||
|
|
||||||
video_features = video_feature_lens = None
|
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
||||||
|
n_image_features = image_features.shape[0]
|
||||||
|
if n_image_tokens != n_image_features:
|
||||||
|
raise ValueError(
|
||||||
|
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
||||||
|
)
|
||||||
|
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
|
||||||
|
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||||
|
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||||
|
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
||||||
|
|
||||||
if pixel_values_videos is not None and pixel_values_videos.size(0) > 0:
|
if pixel_values_videos is not None and pixel_values_videos.size(0) > 0:
|
||||||
video_features = self.get_video_features(
|
video_features = self.get_video_features(
|
||||||
pixel_values_videos,
|
pixel_values_videos,
|
||||||
@@ -954,95 +950,16 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene
|
|||||||
video_features = torch.cat(video_features, dim=0)
|
video_features = torch.cat(video_features, dim=0)
|
||||||
video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
|
video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
|
||||||
|
|
||||||
if legacy_processing:
|
n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
|
||||||
logger.warning_once(
|
n_video_features = video_features.shape[0]
|
||||||
"Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. "
|
if n_video_tokens != n_video_features:
|
||||||
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
|
raise ValueError(
|
||||||
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
|
f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
|
||||||
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
|
|
||||||
)
|
|
||||||
if input_ids.shape[1] != 1:
|
|
||||||
iterator = (
|
|
||||||
(image_features, feature_lens, self.config.image_token_index),
|
|
||||||
(video_features, video_feature_lens, self.config.video_token_index),
|
|
||||||
)
|
)
|
||||||
for features, lens, special_token in iterator:
|
special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1)
|
||||||
if features is not None:
|
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||||
(
|
video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||||
inputs_embeds,
|
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
|
||||||
attention_mask,
|
|
||||||
position_ids,
|
|
||||||
labels,
|
|
||||||
input_ids,
|
|
||||||
) = self._merge_input_ids_with_image_features(
|
|
||||||
features,
|
|
||||||
lens,
|
|
||||||
inputs_embeds,
|
|
||||||
input_ids,
|
|
||||||
attention_mask,
|
|
||||||
position_ids,
|
|
||||||
labels=labels,
|
|
||||||
image_token_index=special_token,
|
|
||||||
)
|
|
||||||
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
|
|
||||||
else:
|
|
||||||
# Retrieve the first layer to inspect the logits and mask out the hidden states that are set to 0
|
|
||||||
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
|
|
||||||
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
|
|
||||||
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
|
|
||||||
# Get the target length
|
|
||||||
target_length = input_ids.shape[1]
|
|
||||||
past_length = first_layer_past_key_value.shape[-1]
|
|
||||||
extended_attention_mask = torch.ones(
|
|
||||||
(attention_mask.shape[0], past_length),
|
|
||||||
dtype=attention_mask.dtype,
|
|
||||||
device=attention_mask.device,
|
|
||||||
)
|
|
||||||
# Filter out only the tokens that can be un-attended, this can happen
|
|
||||||
# if one uses Llava + Fused modules where the cache on the
|
|
||||||
# first iteration is already big enough, or if one passes custom cache
|
|
||||||
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
|
|
||||||
new_batch_index = batch_index[valid_indices]
|
|
||||||
new_non_attended_tokens = non_attended_tokens[valid_indices]
|
|
||||||
# Zero-out the places where we don't need to attend
|
|
||||||
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
|
|
||||||
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
|
|
||||||
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
|
|
||||||
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
|
|
||||||
|
|
||||||
# TODO: @raushan retain only the new behavior after v4.47
|
|
||||||
else:
|
|
||||||
if image_features is not None:
|
|
||||||
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
|
||||||
n_image_features = image_features.shape[0]
|
|
||||||
|
|
||||||
if n_image_tokens != n_image_features:
|
|
||||||
raise ValueError(
|
|
||||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
|
||||||
)
|
|
||||||
special_image_mask = (
|
|
||||||
(input_ids == self.config.image_token_index)
|
|
||||||
.unsqueeze(-1)
|
|
||||||
.expand_as(inputs_embeds)
|
|
||||||
.to(inputs_embeds.device)
|
|
||||||
)
|
|
||||||
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
|
||||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
|
||||||
if video_features is not None:
|
|
||||||
n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
|
|
||||||
n_video_features = video_features.shape[0]
|
|
||||||
if n_video_tokens != n_video_features:
|
|
||||||
raise ValueError(
|
|
||||||
f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
|
|
||||||
)
|
|
||||||
special_image_mask = (
|
|
||||||
(input_ids == self.config.video_token_index)
|
|
||||||
.unsqueeze(-1)
|
|
||||||
.expand_as(inputs_embeds)
|
|
||||||
.to(inputs_embeds.device)
|
|
||||||
)
|
|
||||||
video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
|
||||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
|
|
||||||
|
|
||||||
outputs = self.language_model(
|
outputs = self.language_model(
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
|
|||||||
@@ -431,25 +431,9 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
|
|||||||
"and must specify either one"
|
"and must specify either one"
|
||||||
)
|
)
|
||||||
|
|
||||||
legacy_processing = False
|
|
||||||
if inputs_embeds is None:
|
if inputs_embeds is None:
|
||||||
inputs_embeds = self.get_input_embeddings()(input_ids)
|
inputs_embeds = self.get_input_embeddings()(input_ids)
|
||||||
|
|
||||||
# if the number of image/video tokens is more than image embeddings seq length, then prob we expanded it in processing
|
|
||||||
# not very reliable, but we don't expect one to actually pass 500+ images for one prompt
|
|
||||||
img_token_not_enough = (input_ids == self.config.image_token_index).sum(
|
|
||||||
1
|
|
||||||
).max() < self.config.image_seq_length
|
|
||||||
video_token_not_enough = (input_ids == self.config.video_token_index).sum(
|
|
||||||
1
|
|
||||||
).max() < self.config.video_seq_length
|
|
||||||
inputs_not_expanded = (img_token_not_enough and pixel_values is not None) or (
|
|
||||||
video_token_not_enough and pixel_values_videos is not None
|
|
||||||
)
|
|
||||||
pixels_present = input_ids.shape[-1] == 1 and (pixel_values is not None or pixel_values_videos is not None)
|
|
||||||
legacy_processing = inputs_not_expanded or pixels_present
|
|
||||||
|
|
||||||
image_features = feature_lens = None
|
|
||||||
if pixel_values is not None and pixel_values.size(0) > 0:
|
if pixel_values is not None and pixel_values.size(0) > 0:
|
||||||
image_features = self.get_image_features(
|
image_features = self.get_image_features(
|
||||||
pixel_values,
|
pixel_values,
|
||||||
@@ -464,7 +448,17 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
|
|||||||
image_newline=self.image_newline,
|
image_newline=self.image_newline,
|
||||||
)
|
)
|
||||||
|
|
||||||
video_features = video_feature_lens = None
|
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
||||||
|
n_image_features = image_features.shape[0]
|
||||||
|
if n_image_tokens != n_image_features:
|
||||||
|
raise ValueError(
|
||||||
|
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
||||||
|
)
|
||||||
|
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
|
||||||
|
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||||
|
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||||
|
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
||||||
|
|
||||||
if pixel_values_videos is not None and pixel_values_videos.size(0) > 0:
|
if pixel_values_videos is not None and pixel_values_videos.size(0) > 0:
|
||||||
video_features = self.get_video_features(
|
video_features = self.get_video_features(
|
||||||
pixel_values_videos,
|
pixel_values_videos,
|
||||||
@@ -476,95 +470,16 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
|
|||||||
video_features = torch.cat(video_features, dim=0)
|
video_features = torch.cat(video_features, dim=0)
|
||||||
video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
|
video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
|
||||||
|
|
||||||
if legacy_processing:
|
n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
|
||||||
logger.warning_once(
|
n_video_features = video_features.shape[0]
|
||||||
"Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. "
|
if n_video_tokens != n_video_features:
|
||||||
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
|
raise ValueError(
|
||||||
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
|
f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
|
||||||
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
|
|
||||||
)
|
|
||||||
if input_ids.shape[1] != 1:
|
|
||||||
iterator = (
|
|
||||||
(image_features, feature_lens, self.config.image_token_index),
|
|
||||||
(video_features, video_feature_lens, self.config.video_token_index),
|
|
||||||
)
|
)
|
||||||
for features, lens, special_token in iterator:
|
special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1)
|
||||||
if features is not None:
|
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||||
(
|
video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||||
inputs_embeds,
|
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
|
||||||
attention_mask,
|
|
||||||
position_ids,
|
|
||||||
labels,
|
|
||||||
input_ids,
|
|
||||||
) = self._merge_input_ids_with_image_features(
|
|
||||||
features,
|
|
||||||
lens,
|
|
||||||
inputs_embeds,
|
|
||||||
input_ids,
|
|
||||||
attention_mask,
|
|
||||||
position_ids,
|
|
||||||
labels=labels,
|
|
||||||
image_token_index=special_token,
|
|
||||||
)
|
|
||||||
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
|
|
||||||
else:
|
|
||||||
# Retrieve the first layer to inspect the logits and mask out the hidden states that are set to 0
|
|
||||||
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
|
|
||||||
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
|
|
||||||
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
|
|
||||||
# Get the target length
|
|
||||||
target_length = input_ids.shape[1]
|
|
||||||
past_length = first_layer_past_key_value.shape[-1]
|
|
||||||
extended_attention_mask = torch.ones(
|
|
||||||
(attention_mask.shape[0], past_length),
|
|
||||||
dtype=attention_mask.dtype,
|
|
||||||
device=attention_mask.device,
|
|
||||||
)
|
|
||||||
# Filter out only the tokens that can be un-attended, this can happen
|
|
||||||
# if one uses Llava + Fused modules where the cache on the
|
|
||||||
# first iteration is already big enough, or if one passes custom cache
|
|
||||||
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
|
|
||||||
new_batch_index = batch_index[valid_indices]
|
|
||||||
new_non_attended_tokens = non_attended_tokens[valid_indices]
|
|
||||||
# Zero-out the places where we don't need to attend
|
|
||||||
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
|
|
||||||
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
|
|
||||||
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
|
|
||||||
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
|
|
||||||
|
|
||||||
# TODO: @raushan retain only the new behavior after v4.47
|
|
||||||
else:
|
|
||||||
if image_features is not None:
|
|
||||||
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
|
||||||
n_image_features = image_features.shape[0]
|
|
||||||
|
|
||||||
if n_image_tokens != n_image_features:
|
|
||||||
raise ValueError(
|
|
||||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
|
||||||
)
|
|
||||||
special_image_mask = (
|
|
||||||
(input_ids == self.config.image_token_index)
|
|
||||||
.unsqueeze(-1)
|
|
||||||
.expand_as(inputs_embeds)
|
|
||||||
.to(inputs_embeds.device)
|
|
||||||
)
|
|
||||||
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
|
||||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
|
||||||
if video_features is not None:
|
|
||||||
n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
|
|
||||||
n_video_features = video_features.shape[0]
|
|
||||||
if n_video_tokens != n_video_features:
|
|
||||||
raise ValueError(
|
|
||||||
f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
|
|
||||||
)
|
|
||||||
special_image_mask = (
|
|
||||||
(input_ids == self.config.video_token_index)
|
|
||||||
.unsqueeze(-1)
|
|
||||||
.expand_as(inputs_embeds)
|
|
||||||
.to(inputs_embeds.device)
|
|
||||||
)
|
|
||||||
video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
|
||||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
|
|
||||||
|
|
||||||
outputs = self.language_model(
|
outputs = self.language_model(
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
|
|||||||
@@ -173,48 +173,33 @@ class LlavaNextVideoProcessor(ProcessorMixin):
|
|||||||
elif not isinstance(text, list) and not isinstance(text[0], str):
|
elif not isinstance(text, list) and not isinstance(text[0], str):
|
||||||
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
||||||
|
|
||||||
if self.patch_size is None or self.vision_feature_select_strategy is None:
|
if image_inputs:
|
||||||
logger.warning_once(
|
image_sizes = iter(image_inputs["image_sizes"])
|
||||||
"Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. "
|
height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
|
||||||
"Please add `patch_size`, `num_additional_image_tokens` and `vision_feature_select_strategy` to the model's processing config or set directly "
|
prompt_strings = []
|
||||||
"with `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` "
|
for sample in text:
|
||||||
"and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
|
while self.image_token in sample:
|
||||||
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
|
image_size = next(image_sizes)
|
||||||
)
|
orig_height, orig_width = image_size
|
||||||
else:
|
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
|
||||||
# images expand taking into account num_of_patches in each image
|
if self.vision_feature_select_strategy == "default":
|
||||||
if image_inputs:
|
num_image_tokens -= 1
|
||||||
image_sizes = iter(image_inputs["image_sizes"])
|
sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
|
||||||
height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
|
prompt_strings.append(sample)
|
||||||
prompt_strings = []
|
text = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
|
||||||
for sample in text:
|
|
||||||
while self.image_token in sample:
|
|
||||||
image_size = next(image_sizes)
|
|
||||||
if not isinstance(image_size, (list, tuple)):
|
|
||||||
# cast to list to avoid numerical precision errors when calculating unpadding
|
|
||||||
image_size = image_size.tolist()
|
|
||||||
orig_height, orig_width = image_size
|
|
||||||
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
|
|
||||||
if self.vision_feature_select_strategy == "default":
|
|
||||||
num_image_tokens -= self.num_additional_image_tokens
|
|
||||||
sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
|
|
||||||
prompt_strings.append(sample)
|
|
||||||
text = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
|
|
||||||
|
|
||||||
# videos are easier, simply get frames and multiply
|
# videos are easier, simply get frames and multiply
|
||||||
if videos_inputs:
|
if videos_inputs:
|
||||||
one_video = to_numpy_array(videos_inputs.get("pixel_values_videos")[0])
|
one_video = to_numpy_array(videos_inputs.get("pixel_values_videos")[0])
|
||||||
height, width = get_image_size(one_video[0])
|
height, width = get_image_size(one_video[0])
|
||||||
num_frames = one_video.shape[0] # frame dim is always after batch dim
|
num_frames = one_video.shape[0] # frame dim is always after batch dim
|
||||||
|
num_image_tokens = (height // self.patch_size) * (width // self.patch_size)
|
||||||
# no `self.num_additional_image_tokens` added because video always has a default feature selection strategy
|
num_video_tokens = num_image_tokens // 4 * num_frames # divide by 4 needed for avg pooling layer
|
||||||
num_image_tokens = (height // self.patch_size) * (width // self.patch_size)
|
prompt_strings = []
|
||||||
num_video_tokens = num_image_tokens // 4 * num_frames # divide by 4 needed for avg pooling layer
|
for sample in text:
|
||||||
prompt_strings = []
|
sample = sample.replace(self.video_token, self.video_token * num_video_tokens)
|
||||||
for sample in text:
|
prompt_strings.append(sample)
|
||||||
sample = sample.replace(self.video_token, self.video_token * num_video_tokens)
|
text = prompt_strings
|
||||||
prompt_strings.append(sample)
|
|
||||||
text = prompt_strings
|
|
||||||
|
|
||||||
text_inputs = self.tokenizer(
|
text_inputs = self.tokenizer(
|
||||||
text,
|
text,
|
||||||
|
|||||||
@@ -538,127 +538,41 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi
|
|||||||
"time, and must specify either one"
|
"time, and must specify either one"
|
||||||
)
|
)
|
||||||
|
|
||||||
legacy_processing = False
|
|
||||||
if inputs_embeds is None:
|
if inputs_embeds is None:
|
||||||
inputs_embeds = self.get_input_embeddings()(input_ids)
|
inputs_embeds = self.get_input_embeddings()(input_ids)
|
||||||
|
|
||||||
# if the number of image/video tokens is more than image embeddings seq length, then prob we expanded it in processing
|
|
||||||
# not very reliable, but we don't expect one to actually pass 500+ images for one prompt
|
|
||||||
img_token_not_enough = (input_ids == self.config.image_token_index).sum(
|
|
||||||
1
|
|
||||||
).max() < self.config.image_seq_length
|
|
||||||
video_token_not_enough = (input_ids == self.config.video_token_index).sum(
|
|
||||||
1
|
|
||||||
).max() < self.config.video_seq_length
|
|
||||||
inputs_not_expanded = (img_token_not_enough and pixel_values_images is not None) or (
|
|
||||||
video_token_not_enough and pixel_values_videos is not None
|
|
||||||
)
|
|
||||||
pixels_present = input_ids.shape[-1] == 1 and (
|
|
||||||
pixel_values_images is not None or pixel_values_videos is not None
|
|
||||||
)
|
|
||||||
legacy_processing = inputs_not_expanded or pixels_present
|
|
||||||
|
|
||||||
image_features = None
|
|
||||||
if pixel_values_images is not None:
|
if pixel_values_images is not None:
|
||||||
image_features = self.get_image_features(
|
image_features = self.get_image_features(
|
||||||
pixel_values_images,
|
pixel_values_images,
|
||||||
vision_feature_layer=vision_feature_layer,
|
vision_feature_layer=vision_feature_layer,
|
||||||
vision_feature_select_strategy=vision_feature_select_strategy,
|
vision_feature_select_strategy=vision_feature_select_strategy,
|
||||||
)
|
)
|
||||||
|
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
||||||
|
n_image_features = image_features.shape[0] * image_features.shape[1]
|
||||||
|
if n_image_tokens != n_image_features:
|
||||||
|
raise ValueError(
|
||||||
|
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
||||||
|
)
|
||||||
|
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
|
||||||
|
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||||
|
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||||
|
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
||||||
|
|
||||||
video_features = None
|
|
||||||
num_frames = 0
|
|
||||||
if pixel_values_videos is not None:
|
if pixel_values_videos is not None:
|
||||||
video_features, num_frames = self.get_video_features(
|
video_features, num_frames = self.get_video_features(
|
||||||
pixel_values_videos=pixel_values_videos, vision_feature_layer=vision_feature_layer
|
pixel_values_videos=pixel_values_videos, vision_feature_layer=vision_feature_layer
|
||||||
)
|
)
|
||||||
|
|
||||||
if legacy_processing:
|
n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
|
||||||
logger.warning_once(
|
n_video_features = video_features.shape[0] * video_features.shape[1]
|
||||||
"Expanding inputs for image tokens in Video-LLaVa should be done in processing. "
|
if n_video_tokens != n_video_features:
|
||||||
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
|
raise ValueError(
|
||||||
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
|
f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
|
||||||
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
|
|
||||||
)
|
|
||||||
if input_ids.shape[1] != 1:
|
|
||||||
for features, frames in ((image_features, 1), (video_features, num_frames)):
|
|
||||||
if features is not None:
|
|
||||||
(
|
|
||||||
inputs_embeds,
|
|
||||||
attention_mask,
|
|
||||||
labels,
|
|
||||||
position_ids,
|
|
||||||
input_ids,
|
|
||||||
) = self._merge_input_ids_with_visual_features(
|
|
||||||
features,
|
|
||||||
inputs_embeds,
|
|
||||||
input_ids,
|
|
||||||
attention_mask,
|
|
||||||
labels,
|
|
||||||
num_frames=frames,
|
|
||||||
)
|
|
||||||
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
|
|
||||||
else:
|
|
||||||
# Retrieve the first layer to inspect the logits and mask out the hidden states
|
|
||||||
# that are set to 0
|
|
||||||
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
|
|
||||||
|
|
||||||
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
|
|
||||||
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
|
|
||||||
target_length = input_ids.shape[1]
|
|
||||||
past_length = first_layer_past_key_value.shape[-1]
|
|
||||||
extended_attention_mask = torch.ones(
|
|
||||||
(attention_mask.shape[0], past_length),
|
|
||||||
dtype=attention_mask.dtype,
|
|
||||||
device=attention_mask.device,
|
|
||||||
)
|
)
|
||||||
|
special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1)
|
||||||
# Filter out only the tokens that can be un-attended, this can happen
|
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||||
# if one uses Llava + Fused modules where the cache on the
|
video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||||
# first iteration is already big enough, or if one passes custom cache
|
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
|
||||||
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
|
|
||||||
new_batch_index = batch_index[valid_indices]
|
|
||||||
new_non_attended_tokens = non_attended_tokens[valid_indices]
|
|
||||||
|
|
||||||
# Zero-out the places where we don't need to attend
|
|
||||||
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
|
|
||||||
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
|
|
||||||
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
|
|
||||||
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
|
|
||||||
|
|
||||||
# TODO: @raushan retain only the new behavior after v4.47
|
|
||||||
else:
|
|
||||||
if pixel_values_images is not None:
|
|
||||||
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
|
||||||
n_image_features = image_features.shape[0] * image_features.shape[1]
|
|
||||||
if n_image_tokens != n_image_features:
|
|
||||||
raise ValueError(
|
|
||||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
|
||||||
)
|
|
||||||
special_image_mask = (
|
|
||||||
(input_ids == self.config.image_token_index)
|
|
||||||
.unsqueeze(-1)
|
|
||||||
.expand_as(inputs_embeds)
|
|
||||||
.to(inputs_embeds.device)
|
|
||||||
)
|
|
||||||
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
|
||||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
|
||||||
|
|
||||||
if pixel_values_videos is not None:
|
|
||||||
n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
|
|
||||||
n_video_features = video_features.shape[0] * video_features.shape[1]
|
|
||||||
if n_video_tokens != n_video_features:
|
|
||||||
raise ValueError(
|
|
||||||
f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
|
|
||||||
)
|
|
||||||
special_image_mask = (
|
|
||||||
(input_ids == self.config.video_token_index)
|
|
||||||
.unsqueeze(-1)
|
|
||||||
.expand_as(inputs_embeds)
|
|
||||||
.to(inputs_embeds.device)
|
|
||||||
)
|
|
||||||
video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
|
||||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
|
|
||||||
|
|
||||||
outputs = self.language_model(
|
outputs = self.language_model(
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
|
|||||||
@@ -158,16 +158,8 @@ class VideoLlavaProcessor(ProcessorMixin):
|
|||||||
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
|
||||||
|
|
||||||
prompt_strings = text
|
prompt_strings = text
|
||||||
if encoded_images is not None and (self.patch_size is None or self.vision_feature_select_strategy is None):
|
|
||||||
logger.warning_once(
|
if encoded_images is not None:
|
||||||
"Expanding inputs for image tokens in Video-LLaVa should be done in processing. "
|
|
||||||
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set "
|
|
||||||
"directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = "
|
|
||||||
"{{vision_feature_select_strategy}}`. Using processors without these attributes in the config is "
|
|
||||||
"deprecated and will throw an error in v4.50."
|
|
||||||
)
|
|
||||||
# Replace the image/video tokens with the expanded token sequence
|
|
||||||
elif encoded_images is not None:
|
|
||||||
if "pixel_values_images" in encoded_images.keys():
|
if "pixel_values_images" in encoded_images.keys():
|
||||||
height, width = get_image_size(to_numpy_array(encoded_images.get("pixel_values_images")[0]))
|
height, width = get_image_size(to_numpy_array(encoded_images.get("pixel_values_images")[0]))
|
||||||
num_frames = 1
|
num_frames = 1
|
||||||
|
|||||||
@@ -455,80 +455,22 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin)
|
|||||||
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
|
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
|
||||||
)
|
)
|
||||||
|
|
||||||
legacy_processing = False
|
|
||||||
if inputs_embeds is None:
|
if inputs_embeds is None:
|
||||||
inputs_embeds = self.get_input_embeddings()(input_ids)
|
inputs_embeds = self.get_input_embeddings()(input_ids)
|
||||||
|
|
||||||
# if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
|
|
||||||
# not very reliable, but we don't expect one to actually pass 500+ images for one prompt
|
|
||||||
# In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
|
|
||||||
legacy_processing = (
|
|
||||||
(input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
|
|
||||||
) or (input_ids.shape[-1] == 1 and pixel_values is not None)
|
|
||||||
|
|
||||||
image_features = None
|
|
||||||
if pixel_values is not None:
|
if pixel_values is not None:
|
||||||
image_features = self.get_image_features(
|
image_features = self.get_image_features(
|
||||||
pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
|
pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
|
||||||
)
|
)
|
||||||
|
|
||||||
if legacy_processing:
|
|
||||||
logger.warning_once(
|
|
||||||
"Expanding inputs for image tokens in VipLLaVa should be done in processing. "
|
|
||||||
"Please add `patch_size` and `vision_feature_select_strategy` to the model's image processing config. "
|
|
||||||
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
|
|
||||||
)
|
|
||||||
# prefill stage vs decoding stage (legacy behavior copied)
|
|
||||||
if input_ids.shape[1] != 1:
|
|
||||||
inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
|
|
||||||
image_features, inputs_embeds, input_ids, attention_mask, labels
|
|
||||||
)
|
|
||||||
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
|
|
||||||
else:
|
|
||||||
# Retrieve the first layer to inspect the logits and mask out the hidden states
|
|
||||||
# that are set to 0
|
|
||||||
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
|
|
||||||
|
|
||||||
# Sum all dimensions of head_dim (-1) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
|
|
||||||
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
|
|
||||||
|
|
||||||
target_length = input_ids.shape[1]
|
|
||||||
past_length = first_layer_past_key_value.shape[-1]
|
|
||||||
|
|
||||||
extended_attention_mask = torch.ones(
|
|
||||||
(attention_mask.shape[0], past_length),
|
|
||||||
dtype=attention_mask.dtype,
|
|
||||||
device=attention_mask.device,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Filter out only the tokens that can be un-attended, this can happen
|
|
||||||
# in the case one uses Llava + Fused modules where the cache on the
|
|
||||||
# first iteration is already big enough, or if one passes custom cache
|
|
||||||
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
|
|
||||||
new_batch_index = batch_index[valid_indices]
|
|
||||||
new_non_attended_tokens = non_attended_tokens[valid_indices]
|
|
||||||
|
|
||||||
# Zero-out the places where we don't need to attend
|
|
||||||
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
|
|
||||||
|
|
||||||
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
|
|
||||||
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
|
|
||||||
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
|
|
||||||
|
|
||||||
# TODO: @raushan retain only the new behavior after v4.47
|
|
||||||
elif image_features is not None:
|
|
||||||
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
||||||
n_image_features = image_features.shape[0] * image_features.shape[1]
|
n_image_features = image_features.shape[0] * image_features.shape[1]
|
||||||
if n_image_tokens != n_image_features:
|
if n_image_tokens != n_image_features:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
||||||
)
|
)
|
||||||
special_image_mask = (
|
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
|
||||||
(input_ids == self.config.image_token_index)
|
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||||
.unsqueeze(-1)
|
|
||||||
.expand_as(inputs_embeds)
|
|
||||||
.to(inputs_embeds.device)
|
|
||||||
)
|
|
||||||
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
||||||
|
|
||||||
|
|||||||
@@ -327,10 +327,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
|
prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
|
||||||
image_file = "https://llava-vl.github.io/static/images/view.jpg"
|
image_file = "https://llava-vl.github.io/static/images/view.jpg"
|
||||||
raw_image = Image.open(requests.get(image_file, stream=True).raw)
|
raw_image = Image.open(requests.get(image_file, stream=True).raw)
|
||||||
inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt")
|
inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device)
|
||||||
|
|
||||||
EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]]) # fmt: skip
|
|
||||||
self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
|
|
||||||
|
|
||||||
output = model.generate(**inputs, max_new_tokens=20)
|
output = model.generate(**inputs, max_new_tokens=20)
|
||||||
EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly," # fmt: skip
|
EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly," # fmt: skip
|
||||||
@@ -378,7 +375,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
|
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
|
||||||
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
|
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
|
||||||
|
|
||||||
inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
|
inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to(torch_device)
|
||||||
|
|
||||||
output = model.generate(**inputs, max_new_tokens=20)
|
output = model.generate(**inputs, max_new_tokens=20)
|
||||||
|
|
||||||
@@ -402,7 +399,9 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
|
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
|
||||||
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
|
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
|
||||||
|
|
||||||
inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
|
inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to(
|
||||||
|
torch_device
|
||||||
|
)
|
||||||
|
|
||||||
output = model.generate(**inputs, max_new_tokens=20)
|
output = model.generate(**inputs, max_new_tokens=20)
|
||||||
|
|
||||||
@@ -434,7 +433,9 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
|
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
|
||||||
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
|
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
|
||||||
|
|
||||||
inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True)
|
inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True).to(
|
||||||
|
torch_device
|
||||||
|
)
|
||||||
|
|
||||||
output = model.generate(**inputs, max_new_tokens=20)
|
output = model.generate(**inputs, max_new_tokens=20)
|
||||||
|
|
||||||
@@ -508,32 +509,18 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
# This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
|
# This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
|
||||||
model_id = "llava-hf/llava-1.5-7b-hf"
|
model_id = "llava-hf/llava-1.5-7b-hf"
|
||||||
model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||||
|
processor = AutoProcessor.from_pretrained(model_id)
|
||||||
|
|
||||||
# Simulate some user inputs
|
prompt = "USER: <image>\nDescribe the imageASSISTANT:"
|
||||||
pixel_values = torch.randn(
|
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
(1, 3, 336, 336),
|
|
||||||
dtype=torch.float,
|
raw_image = Image.open(requests.get(image_file, stream=True).raw)
|
||||||
device=torch_device,
|
inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
|
||||||
)
|
|
||||||
input_ids = torch.tensor(
|
|
||||||
[
|
|
||||||
[32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900],
|
|
||||||
],
|
|
||||||
dtype=torch.long,
|
|
||||||
device=torch_device,
|
|
||||||
)
|
|
||||||
attention_mask = torch.tensor(
|
|
||||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1]],
|
|
||||||
dtype=torch.long,
|
|
||||||
device=torch_device,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Make sure that the loss is properly computed
|
# Make sure that the loss is properly computed
|
||||||
loss = model(
|
loss = model(
|
||||||
pixel_values=pixel_values,
|
**inputs,
|
||||||
input_ids=input_ids,
|
labels=inputs.input_ids.clone(),
|
||||||
attention_mask=attention_mask,
|
|
||||||
labels=input_ids,
|
|
||||||
).loss
|
).loss
|
||||||
loss.backward()
|
loss.backward()
|
||||||
|
|
||||||
@@ -593,38 +580,6 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. The cat"
|
EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. The cat"
|
||||||
self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT)
|
self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT)
|
||||||
|
|
||||||
@slow
|
|
||||||
@require_bitsandbytes
|
|
||||||
def test_expansion_in_processing(self):
|
|
||||||
model_id = "llava-hf/llava-1.5-7b-hf"
|
|
||||||
model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
|
||||||
processor = AutoProcessor.from_pretrained(model_id)
|
|
||||||
|
|
||||||
prompt = "USER: <image>\nDescribe the image:\nASSISTANT:"
|
|
||||||
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
||||||
raw_image = Image.open(requests.get(image_file, stream=True).raw)
|
|
||||||
|
|
||||||
# check processing with expansion of inputs
|
|
||||||
processor.vision_feature_select_strategy = "default"
|
|
||||||
processor.num_additional_image_tokens = 1
|
|
||||||
processor.patch_size = 14
|
|
||||||
inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
|
|
||||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593)
|
|
||||||
|
|
||||||
# check processing without expansion of inputs (legacy behavior)
|
|
||||||
processor.vision_feature_select_strategy = None
|
|
||||||
processor.patch_size = None
|
|
||||||
processor.num_additional_image_tokens = None
|
|
||||||
inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
|
|
||||||
self.assertTrue(inputs.input_ids.shape[-1] == 18)
|
|
||||||
|
|
||||||
# generate exactly 20 tokens
|
|
||||||
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
|
|
||||||
# check that both inputs are handled correctly and generate the same output
|
|
||||||
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_pixtral(self):
|
def test_pixtral(self):
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
shutil.rmtree(self.tmpdirname)
|
shutil.rmtree(self.tmpdirname)
|
||||||
|
|
||||||
def prepare_processor_dict(self):
|
def prepare_processor_dict(self):
|
||||||
return {"chat_template": "dummy_template"}
|
return {"chat_template": "dummy_template", "patch_size": 3, "vision_feature_select_strategy": "default"}
|
||||||
|
|
||||||
@unittest.skip(
|
@unittest.skip(
|
||||||
"Skip because the model has no processor kwargs except for chat template and"
|
"Skip because the model has no processor kwargs except for chat template and"
|
||||||
|
|||||||
@@ -396,8 +396,10 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
original_input_ids = torch.load(filepath, map_location="cpu")
|
original_input_ids = torch.load(filepath, map_location="cpu")
|
||||||
# replace -200 by image_token_index (since we use token ID = 32000 for the image token)
|
# replace -200 by image_token_index (since we use token ID = 32000 for the image token)
|
||||||
original_input_ids[original_input_ids == -200] = model.config.image_token_index
|
# remove image token indices because HF impl expands image tokens `image_seq_length` times
|
||||||
assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist()
|
original_input_ids = original_input_ids[original_input_ids != -200]
|
||||||
|
observed_input_ids = inputs.input_ids[inputs.input_ids != model.config.image_token_index]
|
||||||
|
assert original_input_ids[0].tolist() == observed_input_ids[0].tolist()
|
||||||
|
|
||||||
filepath = hf_hub_download(
|
filepath = hf_hub_download(
|
||||||
repo_id="nielsr/test-image",
|
repo_id="nielsr/test-image",
|
||||||
@@ -414,7 +416,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
|
|
||||||
expected_slice = torch.tensor(
|
expected_slice = torch.tensor(
|
||||||
[[-4.7695, -4.5664, -0.2788], [-10.6172, -10.8828, -2.5273], [-6.7383, -7.2422, -0.6694]],
|
[[-4.7695, -4.5664, -0.2788], [-10.6172, -10.8828, -2.5273], [-6.7383, -7.2422, -0.6694]],
|
||||||
dtype=torch.float32,
|
dtype=torch.float16,
|
||||||
device=torch_device,
|
device=torch_device,
|
||||||
)
|
)
|
||||||
assert torch.allclose(output.logits[0, :3, :3], expected_slice, atol=1e-3)
|
assert torch.allclose(output.logits[0, :3, :3], expected_slice, atol=1e-3)
|
||||||
@@ -518,11 +520,11 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
|
|
||||||
expected_slice = torch.tensor(
|
expected_slice = torch.tensor(
|
||||||
[[-0.1287, -0.1294, -0.1284], [-0.2744, -0.2698, -0.2671], [-0.1071, -0.1091, -0.1056]],
|
[[-0.1287, -0.1294, -0.1284], [-0.2744, -0.2698, -0.2671], [-0.1071, -0.1091, -0.1056]],
|
||||||
dtype=torch.float32,
|
dtype=torch.float16,
|
||||||
device=torch_device,
|
device=torch_device,
|
||||||
)
|
)
|
||||||
assert torch.allclose(output.logits[0, -3:, -3:], expected_slice, atol=1e-3)
|
assert torch.allclose(output.logits[0, -3:, -3:], expected_slice, atol=1e-3)
|
||||||
assert torch.allclose(output.loss, torch.tensor(7.0206, device=torch_device), atol=1e-3)
|
assert torch.allclose(output.loss, torch.tensor(7.0206, dtype=torch.float16, device=torch_device), atol=1e-3)
|
||||||
|
|
||||||
# verify generation
|
# verify generation
|
||||||
output = model.generate(**inputs, max_new_tokens=50)
|
output = model.generate(**inputs, max_new_tokens=50)
|
||||||
@@ -601,80 +603,6 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
|
|
||||||
self.assertIn("Padding side is set to 'right' but the model is in inference mode. For correct", logs)
|
self.assertIn("Padding side is set to 'right' but the model is in inference mode. For correct", logs)
|
||||||
|
|
||||||
@slow
|
|
||||||
@require_bitsandbytes
|
|
||||||
def test_expansion_in_processing_multiimage(self):
|
|
||||||
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
|
|
||||||
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
|
||||||
processor = AutoProcessor.from_pretrained(model_id)
|
|
||||||
|
|
||||||
prompt = "USER: <image><image>\nDescribe the similarity between the two images:\nASSISTANT:"
|
|
||||||
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
||||||
raw_image = Image.open(requests.get(image_file, stream=True).raw)
|
|
||||||
deer_image = Image.open(
|
|
||||||
requests.get(
|
|
||||||
"https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e",
|
|
||||||
stream=True,
|
|
||||||
).raw
|
|
||||||
)
|
|
||||||
|
|
||||||
# check processing with expansion of inputs
|
|
||||||
processor.vision_feature_select_strategy = "default"
|
|
||||||
processor.patch_size = 14
|
|
||||||
processor.num_additional_image_tokens = 1
|
|
||||||
inputs_expanded = processor(text=prompt, images=[raw_image, deer_image], return_tensors="pt").to(
|
|
||||||
torch_device, torch.float16
|
|
||||||
)
|
|
||||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 3969)
|
|
||||||
|
|
||||||
# check processing without expansion of inputs (legacy behavior)
|
|
||||||
processor.vision_feature_select_strategy = None
|
|
||||||
processor.patch_size = None
|
|
||||||
processor.num_additional_image_tokens = None
|
|
||||||
inputs = processor(text=prompt, images=[raw_image, deer_image], return_tensors="pt").to(
|
|
||||||
torch_device, torch.float16
|
|
||||||
)
|
|
||||||
self.assertTrue(inputs.input_ids.shape[-1] == 23)
|
|
||||||
|
|
||||||
# generate exactly 20 tokens
|
|
||||||
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
|
|
||||||
# check that both inputs are handled correctly and generate the same output
|
|
||||||
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
|
||||||
|
|
||||||
@slow
|
|
||||||
@require_bitsandbytes
|
|
||||||
def test_expansion_in_processing(self):
|
|
||||||
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
|
|
||||||
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
|
||||||
processor = AutoProcessor.from_pretrained(model_id)
|
|
||||||
|
|
||||||
prompt = "USER: <image>\nDescribe the image:\nASSISTANT:"
|
|
||||||
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
||||||
raw_image = Image.open(requests.get(image_file, stream=True).raw)
|
|
||||||
|
|
||||||
# check processing with expansion of inputs
|
|
||||||
processor.vision_feature_select_strategy = "default"
|
|
||||||
processor.patch_size = 14
|
|
||||||
processor.num_additional_image_tokens = 1
|
|
||||||
inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
|
|
||||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2356)
|
|
||||||
|
|
||||||
# check processing without expansion of inputs (legacy behavior)
|
|
||||||
processor.vision_feature_select_strategy = None
|
|
||||||
processor.patch_size = None
|
|
||||||
processor.num_additional_image_tokens = None
|
|
||||||
inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
|
|
||||||
self.assertTrue(inputs.input_ids.shape[-1] == 17)
|
|
||||||
|
|
||||||
# generate exactly 20 tokens
|
|
||||||
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
|
|
||||||
# check that both inputs are handled correctly and generate the same output
|
|
||||||
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_small_model_integration_test_full_vision_state_selection(self):
|
def test_small_model_integration_test_full_vision_state_selection(self):
|
||||||
@@ -685,7 +613,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
# test that changing `strategy` won't error out
|
# test that changing `strategy` won't error out
|
||||||
model.vision_feature_select_strategy = "full"
|
model.vision_feature_select_strategy = "full"
|
||||||
|
|
||||||
inputs = self.processor(self.prompt, self.image, return_tensors="pt")
|
inputs = self.processor(self.prompt, self.image, return_tensors="pt").to(model.device)
|
||||||
|
|
||||||
# verify generation
|
# verify generation
|
||||||
output = model.generate(**inputs, max_new_tokens=30)
|
output = model.generate(**inputs, max_new_tokens=30)
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ from ...test_processing_common import ProcessorTesterMixin
|
|||||||
|
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from transformers import CLIPImageProcessor
|
from transformers import LlavaNextImageProcessor
|
||||||
|
|
||||||
|
|
||||||
@require_vision
|
@require_vision
|
||||||
@@ -37,7 +37,7 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.tmpdirname = tempfile.mkdtemp()
|
self.tmpdirname = tempfile.mkdtemp()
|
||||||
|
|
||||||
image_processor = CLIPImageProcessor()
|
image_processor = LlavaNextImageProcessor()
|
||||||
tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
|
tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
|
||||||
processor_kwargs = self.prepare_processor_dict()
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs)
|
processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs)
|
||||||
@@ -50,7 +50,7 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
||||||
|
|
||||||
def prepare_processor_dict(self):
|
def prepare_processor_dict(self):
|
||||||
return {"chat_template": "dummy_template"}
|
return {"chat_template": "dummy_template", "patch_size": 3, "vision_feature_select_strategy": "default"}
|
||||||
|
|
||||||
@unittest.skip(
|
@unittest.skip(
|
||||||
"Skip because the model has no processor kwargs except for chat template and"
|
"Skip because the model has no processor kwargs except for chat template and"
|
||||||
|
|||||||
@@ -17,7 +17,6 @@
|
|||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import requests
|
|
||||||
from huggingface_hub import hf_hub_download
|
from huggingface_hub import hf_hub_download
|
||||||
|
|
||||||
from transformers import (
|
from transformers import (
|
||||||
@@ -543,107 +542,3 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
model(**inputs_batched, output_hidden_states=True)
|
model(**inputs_batched, output_hidden_states=True)
|
||||||
|
|
||||||
self.assertIn("Padding side is set to 'right' but the model is in inference mode. For correct", logs)
|
self.assertIn("Padding side is set to 'right' but the model is in inference mode. For correct", logs)
|
||||||
|
|
||||||
@slow
|
|
||||||
@require_bitsandbytes
|
|
||||||
def test_expansion_in_processing(self):
|
|
||||||
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
|
|
||||||
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
|
|
||||||
"llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True
|
|
||||||
)
|
|
||||||
processor = AutoProcessor.from_pretrained(model_id)
|
|
||||||
|
|
||||||
# check processing with expansion of inputs
|
|
||||||
processor.vision_feature_select_strategy = "default"
|
|
||||||
processor.patch_size = 14
|
|
||||||
processor.num_additional_image_tokens = 1
|
|
||||||
inputs_expanded = processor(self.prompt_video, videos=[self.video], return_tensors="pt").to(torch_device)
|
|
||||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 1170)
|
|
||||||
|
|
||||||
# check processing without expansion of inputs (legacy behavior)
|
|
||||||
processor.vision_feature_select_strategy = None
|
|
||||||
processor.patch_size = None
|
|
||||||
processor.num_additional_image_tokens = None
|
|
||||||
inputs = processor(self.prompt_video, videos=[self.video], return_tensors="pt").to(torch_device)
|
|
||||||
self.assertTrue(inputs.input_ids.shape[-1] == 19)
|
|
||||||
|
|
||||||
# generate exactly 20 tokens
|
|
||||||
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
|
|
||||||
# check that both inputs are handled correctly and generate the same output
|
|
||||||
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
|
||||||
|
|
||||||
@slow
|
|
||||||
@require_bitsandbytes
|
|
||||||
def test_expansion_in_processing_images(self):
|
|
||||||
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
|
|
||||||
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
|
|
||||||
"llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True
|
|
||||||
)
|
|
||||||
processor = AutoProcessor.from_pretrained(model_id)
|
|
||||||
|
|
||||||
# check processing with expansion of inputs
|
|
||||||
processor.vision_feature_select_strategy = "default"
|
|
||||||
processor.patch_size = 14
|
|
||||||
processor.num_additional_image_tokens = 1
|
|
||||||
inputs_expanded = processor(self.prompt_image, images=[self.image], return_tensors="pt").to(torch_device)
|
|
||||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2652)
|
|
||||||
|
|
||||||
# check processing without expansion of inputs (legacy behavior)
|
|
||||||
processor.vision_feature_select_strategy = None
|
|
||||||
processor.patch_size = None
|
|
||||||
processor.num_additional_image_tokens = None
|
|
||||||
inputs = processor(self.prompt_image, images=[self.image], return_tensors="pt").to(torch_device)
|
|
||||||
self.assertTrue(inputs.input_ids.shape[-1] == 19)
|
|
||||||
|
|
||||||
# generate exactly 20 tokens
|
|
||||||
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
|
|
||||||
# check that both inputs are handled correctly and generate the same output
|
|
||||||
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
|
||||||
|
|
||||||
@slow
|
|
||||||
@require_bitsandbytes
|
|
||||||
def test_expansion_in_processing_multiimage(self):
|
|
||||||
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
|
|
||||||
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
|
|
||||||
"llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True
|
|
||||||
)
|
|
||||||
processor = AutoProcessor.from_pretrained(model_id)
|
|
||||||
|
|
||||||
prompt = "USER: <image><image>\nDescribe the similarity between the two images:\nASSISTANT:"
|
|
||||||
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
||||||
raw_image = Image.open(requests.get(image_file, stream=True).raw)
|
|
||||||
deer_image = Image.open(
|
|
||||||
requests.get(
|
|
||||||
"https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e",
|
|
||||||
stream=True,
|
|
||||||
).raw
|
|
||||||
)
|
|
||||||
|
|
||||||
# check processing with expansion of inputs
|
|
||||||
processor.vision_feature_select_strategy = "default"
|
|
||||||
processor.patch_size = 14
|
|
||||||
processor.num_additional_image_tokens = 1
|
|
||||||
inputs_expanded = processor(text=prompt, images=[raw_image, deer_image], return_tensors="pt").to(
|
|
||||||
torch_device, torch.float16
|
|
||||||
)
|
|
||||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 3968)
|
|
||||||
|
|
||||||
# check processing without expansion of inputs (legacy behavior)
|
|
||||||
processor.vision_feature_select_strategy = None
|
|
||||||
processor.patch_size = None
|
|
||||||
processor.num_additional_image_tokens = None
|
|
||||||
inputs = processor(text=prompt, images=[raw_image, deer_image], return_tensors="pt").to(
|
|
||||||
torch_device, torch.float16
|
|
||||||
)
|
|
||||||
self.assertTrue(inputs.input_ids.shape[-1] == 22)
|
|
||||||
|
|
||||||
# generate exactly 20 tokens
|
|
||||||
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
|
|
||||||
# check that both inputs are handled correctly and generate the same output
|
|
||||||
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
|
||||||
|
|||||||
@@ -127,7 +127,6 @@ class VideoLlavaVisionText2TextModelTester:
|
|||||||
self.num_image_tokens = (vision_config["image_size"] // vision_config["patch_size"]) ** 2
|
self.num_image_tokens = (vision_config["image_size"] // vision_config["patch_size"]) ** 2
|
||||||
self.num_video_tokens = (self.num_image_tokens + 1) * self.num_frames
|
self.num_video_tokens = (self.num_image_tokens + 1) * self.num_frames
|
||||||
self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens
|
self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens
|
||||||
self.encoder_seq_length = self.seq_length
|
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return VideoLlavaConfig(
|
return VideoLlavaConfig(
|
||||||
@@ -185,22 +184,6 @@ class VideoLlavaVisionText2TextModelTester:
|
|||||||
}
|
}
|
||||||
return config, inputs_dict
|
return config, inputs_dict
|
||||||
|
|
||||||
def prepare_config_and_inputs_for_batched_test(self):
|
|
||||||
config_and_inputs = self.prepare_config_and_inputs()
|
|
||||||
config, _, pixel_values_videos = config_and_inputs
|
|
||||||
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
|
|
||||||
attention_mask = input_ids.ne(1).to(torch_device)
|
|
||||||
|
|
||||||
# make sure no other special tokens are set
|
|
||||||
input_ids[(input_ids == 0) | (input_ids == 1)] = 3
|
|
||||||
input_ids[:, 0] = config.video_token_index
|
|
||||||
inputs_dict = {
|
|
||||||
"pixel_values_videos": pixel_values_videos,
|
|
||||||
"input_ids": input_ids,
|
|
||||||
"attention_mask": attention_mask,
|
|
||||||
}
|
|
||||||
return config, inputs_dict
|
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||||
@@ -339,7 +322,7 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
config, batched_input = self.model_tester.prepare_config_and_inputs_for_batched_test()
|
config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
config.output_hidden_states = True
|
config.output_hidden_states = True
|
||||||
@@ -457,11 +440,11 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
|
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
|
||||||
)
|
)
|
||||||
video_file = np.load(video_file)
|
video_file = np.load(video_file)
|
||||||
inputs = self.processor(prompt, videos=video_file, return_tensors="pt")
|
inputs = self.processor(prompt, videos=video_file, return_tensors="pt").to(torch_device)
|
||||||
|
|
||||||
EXPECTED_INPUT_IDS = torch.tensor([[1, 3148, 1001, 29901, 29871, 32001, 13, 11008, 338, 445, 4863, 2090, 1460, 29973, 319, 1799, 9047, 13566, 29901]]) # fmt: skip
|
EXPECTED_INPUT_IDS = torch.tensor([1, 3148, 1001, 29901, 29871, 13, 11008, 338, 445, 4863, 2090, 1460, 29973, 319, 1799, 9047, 13566, 29901], device=torch_device) # fmt: skip
|
||||||
|
non_video_inputs = inputs["input_ids"][inputs["input_ids"] != 32001]
|
||||||
self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
|
self.assertTrue(torch.equal(non_video_inputs, EXPECTED_INPUT_IDS))
|
||||||
|
|
||||||
output = model.generate(**inputs, do_sample=False, max_new_tokens=20)
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=20)
|
||||||
EXPECTED_DECODED_TEXT = "USER: \nWhy is this video funny? ASSISTANT: The video is funny because it shows a baby sitting on a bed and reading a book, which" # fmt: skip
|
EXPECTED_DECODED_TEXT = "USER: \nWhy is this video funny? ASSISTANT: The video is funny because it shows a baby sitting on a bed and reading a book, which" # fmt: skip
|
||||||
@@ -487,7 +470,9 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||||
image = Image.open(requests.get(url, stream=True).raw)
|
image = Image.open(requests.get(url, stream=True).raw)
|
||||||
|
|
||||||
inputs = self.processor(prompts, images=[image], videos=[video_file], padding=True, return_tensors="pt")
|
inputs = self.processor(prompts, images=[image], videos=[video_file], padding=True, return_tensors="pt").to(
|
||||||
|
torch_device
|
||||||
|
)
|
||||||
output = model.generate(**inputs, do_sample=False, max_new_tokens=20)
|
output = model.generate(**inputs, do_sample=False, max_new_tokens=20)
|
||||||
|
|
||||||
EXPECTED_DECODED_TEXT = [
|
EXPECTED_DECODED_TEXT = [
|
||||||
@@ -543,7 +528,7 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo_2.npy", repo_type="dataset")
|
hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo_2.npy", repo_type="dataset")
|
||||||
)
|
)
|
||||||
|
|
||||||
inputs = processor(prompts, videos=[video_1, video_2], return_tensors="pt", padding=True)
|
inputs = processor(prompts, videos=[video_1, video_2], return_tensors="pt", padding=True).to(torch_device)
|
||||||
|
|
||||||
output = model.generate(**inputs, max_new_tokens=20)
|
output = model.generate(**inputs, max_new_tokens=20)
|
||||||
|
|
||||||
@@ -583,96 +568,16 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
# This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
|
# This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
|
||||||
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
|
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", load_in_4bit=True)
|
||||||
|
|
||||||
# Simulate some user inputs
|
prompt = "USER: <video>\nDescribe the video:? ASSISTANT:"
|
||||||
pixel_values_videos = torch.randn(
|
|
||||||
(1, 8, 3, 224, 224),
|
|
||||||
dtype=torch.float,
|
|
||||||
device=torch_device,
|
|
||||||
)
|
|
||||||
# fmt: off
|
|
||||||
input_ids = torch.tensor(
|
|
||||||
[[32002, 32002, 1, 15043, 7084, 32001, 29871, 13, 7900]],
|
|
||||||
dtype=torch.long,
|
|
||||||
device=torch_device,
|
|
||||||
)
|
|
||||||
# fmt: on
|
|
||||||
attention_mask = torch.tensor(
|
|
||||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
|
|
||||||
dtype=torch.long,
|
|
||||||
device=torch_device,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Make sure that the loss is properly computed
|
|
||||||
loss = model(
|
|
||||||
pixel_values_videos=pixel_values_videos,
|
|
||||||
input_ids=input_ids,
|
|
||||||
attention_mask=attention_mask,
|
|
||||||
labels=input_ids,
|
|
||||||
).loss
|
|
||||||
loss.backward()
|
|
||||||
|
|
||||||
@slow
|
|
||||||
@require_bitsandbytes
|
|
||||||
def test_expansion_in_processing_images(self):
|
|
||||||
model_id = "LanguageBind/Video-LLaVA-7B-hf"
|
|
||||||
model = VideoLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
|
||||||
processor = VideoLlavaProcessor.from_pretrained(model_id)
|
|
||||||
|
|
||||||
prompt = "USER: <image>\nDescribe the image in details. ASSISTANT:"
|
|
||||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
||||||
image = Image.open(requests.get(url, stream=True).raw)
|
|
||||||
|
|
||||||
# check processing with expansion of inputs
|
|
||||||
processor.vision_feature_select_strategy = "default"
|
|
||||||
processor.patch_size = 14
|
|
||||||
processor.num_additional_image_tokens = 1
|
|
||||||
inputs_expanded = processor(prompt, images=image, return_tensors="pt").to(torch_device, torch.float16)
|
|
||||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 274)
|
|
||||||
|
|
||||||
# check processing without expansion of inputs (legacy behavior)
|
|
||||||
processor.vision_feature_select_strategy = None
|
|
||||||
processor.patch_size = None
|
|
||||||
processor.num_additional_image_tokens = None
|
|
||||||
inputs = processor(prompt, images=image, return_tensors="pt").to(torch_device, torch.float16)
|
|
||||||
self.assertTrue(inputs.input_ids.shape[-1] == 19)
|
|
||||||
|
|
||||||
# generate exactly 20 tokens
|
|
||||||
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
|
|
||||||
# check that both inputs are handled correctly and generate the same output
|
|
||||||
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
|
||||||
|
|
||||||
@slow
|
|
||||||
@require_bitsandbytes
|
|
||||||
def test_expansion_in_processing(self):
|
|
||||||
model_id = "LanguageBind/Video-LLaVA-7B-hf"
|
|
||||||
model = VideoLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
|
||||||
processor = VideoLlavaProcessor.from_pretrained(model_id)
|
|
||||||
|
|
||||||
prompt = "USER: <video>\nDescribe the video in details. ASSISTANT:"
|
|
||||||
video_file = hf_hub_download(
|
video_file = hf_hub_download(
|
||||||
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
|
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
|
||||||
)
|
)
|
||||||
video_file = np.load(video_file)
|
video_file = np.load(video_file)
|
||||||
|
inputs = self.processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
|
||||||
|
|
||||||
# check processing with expansion of inputs
|
# Make sure that the loss is properly computed
|
||||||
processor.vision_feature_select_strategy = "default"
|
loss = model(
|
||||||
processor.patch_size = 14
|
**inputs,
|
||||||
processor.num_additional_image_tokens = 1
|
labels=inputs.input_ids.clone(),
|
||||||
inputs_expanded = processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
|
).loss
|
||||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2074)
|
loss.backward()
|
||||||
|
|
||||||
# check processing without expansion of inputs (legacy behavior)
|
|
||||||
processor.vision_feature_select_strategy = None
|
|
||||||
processor.patch_size = None
|
|
||||||
processor.num_additional_image_tokens = None
|
|
||||||
inputs = processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
|
|
||||||
self.assertTrue(inputs.input_ids.shape[-1] == 19)
|
|
||||||
|
|
||||||
# generate exactly 20 tokens
|
|
||||||
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
|
|
||||||
# check that both inputs are handled correctly and generate the same output
|
|
||||||
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
|
||||||
|
|||||||
@@ -320,7 +320,7 @@ class VipLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
|
|
||||||
outputs = model.generate(**inputs, max_new_tokens=10)
|
outputs = model.generate(**inputs, max_new_tokens=10)
|
||||||
|
|
||||||
EXPECTED_OUTPUT = "USER: <image> \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on"
|
EXPECTED_OUTPUT = "USER: \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on"
|
||||||
self.assertEqual(processor.decode(outputs[0], skip_special_tokens=True), EXPECTED_OUTPUT)
|
self.assertEqual(processor.decode(outputs[0], skip_special_tokens=True), EXPECTED_OUTPUT)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
@@ -329,63 +329,17 @@ class VipLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
# This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
|
# This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
|
||||||
model_id = "llava-hf/vip-llava-7b-hf"
|
model_id = "llava-hf/vip-llava-7b-hf"
|
||||||
model = VipLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
model = VipLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||||
|
processor = AutoProcessor.from_pretrained(model_id)
|
||||||
|
|
||||||
# Simulate some user inputs
|
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
|
||||||
pixel_values = torch.randn(
|
image = Image.open(requests.get(url, stream=True).raw)
|
||||||
(1, 3, 336, 336),
|
prompt = "USER: <image>\nCan you please describe this image?\nASSISTANT:"
|
||||||
dtype=torch.float,
|
|
||||||
device=torch_device,
|
inputs = processor(prompt, image, return_tensors="pt").to(torch_device, torch.float16)
|
||||||
)
|
|
||||||
input_ids = torch.tensor(
|
|
||||||
[
|
|
||||||
[32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900],
|
|
||||||
],
|
|
||||||
dtype=torch.long,
|
|
||||||
device=torch_device,
|
|
||||||
)
|
|
||||||
attention_mask = torch.tensor(
|
|
||||||
[[0, 0, 1, 1, 1, 1, 1, 1, 1]],
|
|
||||||
dtype=torch.long,
|
|
||||||
device=torch_device,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Make sure that the loss is properly computed
|
# Make sure that the loss is properly computed
|
||||||
loss = model(
|
loss = model(
|
||||||
pixel_values=pixel_values,
|
**inputs,
|
||||||
input_ids=input_ids,
|
labels=inputs.input_ids.clone(),
|
||||||
attention_mask=attention_mask,
|
|
||||||
labels=input_ids,
|
|
||||||
).loss
|
).loss
|
||||||
loss.backward()
|
loss.backward()
|
||||||
|
|
||||||
@slow
|
|
||||||
@require_bitsandbytes
|
|
||||||
def test_expansion_in_processing(self):
|
|
||||||
model_id = "llava-hf/vip-llava-7b-hf"
|
|
||||||
model = VipLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
|
||||||
processor = AutoProcessor.from_pretrained(model_id)
|
|
||||||
|
|
||||||
prompt = "USER: <image>\nDescribe the image:\nASSISTANT:"
|
|
||||||
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
|
||||||
raw_image = Image.open(requests.get(image_file, stream=True).raw)
|
|
||||||
|
|
||||||
# check processing with expansion of inputs
|
|
||||||
processor.vision_feature_select_strategy = "default"
|
|
||||||
processor.patch_size = 14
|
|
||||||
processor.num_additional_image_tokens = 1
|
|
||||||
inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
|
|
||||||
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593)
|
|
||||||
|
|
||||||
# check processing without expansion of inputs (legacy behavior)
|
|
||||||
processor.vision_feature_select_strategy = None
|
|
||||||
processor.patch_size = None
|
|
||||||
processor.num_additional_image_tokens = None
|
|
||||||
inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
|
|
||||||
self.assertTrue(inputs.input_ids.shape[-1] == 18)
|
|
||||||
|
|
||||||
# generate exactly 20 tokens
|
|
||||||
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
|
|
||||||
|
|
||||||
# check that both inputs are handled correctly and generate the same output
|
|
||||||
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
|
|
||||||
|
|||||||
@@ -174,8 +174,9 @@ class ProcessorTesterMixin:
|
|||||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||||
processor_components = self.prepare_components()
|
processor_components = self.prepare_components()
|
||||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||||
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
|
|
||||||
processor = self.processor_class(**processor_components)
|
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||||
self.skip_processor_without_typed_kwargs(processor)
|
self.skip_processor_without_typed_kwargs(processor)
|
||||||
input_str = self.prepare_text_inputs()
|
input_str = self.prepare_text_inputs()
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
@@ -195,8 +196,9 @@ class ProcessorTesterMixin:
|
|||||||
"image_processor", do_rescale=True, rescale_factor=-1
|
"image_processor", do_rescale=True, rescale_factor=-1
|
||||||
)
|
)
|
||||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||||
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
|
|
||||||
processor = self.processor_class(**processor_components)
|
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||||
self.skip_processor_without_typed_kwargs(processor)
|
self.skip_processor_without_typed_kwargs(processor)
|
||||||
|
|
||||||
input_str = self.prepare_text_inputs()
|
input_str = self.prepare_text_inputs()
|
||||||
@@ -210,8 +212,9 @@ class ProcessorTesterMixin:
|
|||||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||||
processor_components = self.prepare_components()
|
processor_components = self.prepare_components()
|
||||||
processor_components["tokenizer"] = self.get_component("tokenizer", padding="longest")
|
processor_components["tokenizer"] = self.get_component("tokenizer", padding="longest")
|
||||||
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
|
|
||||||
processor = self.processor_class(**processor_components)
|
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||||
self.skip_processor_without_typed_kwargs(processor)
|
self.skip_processor_without_typed_kwargs(processor)
|
||||||
input_str = self.prepare_text_inputs()
|
input_str = self.prepare_text_inputs()
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
@@ -228,8 +231,9 @@ class ProcessorTesterMixin:
|
|||||||
"image_processor", do_rescale=True, rescale_factor=1
|
"image_processor", do_rescale=True, rescale_factor=1
|
||||||
)
|
)
|
||||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||||
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
|
|
||||||
processor = self.processor_class(**processor_components)
|
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||||
self.skip_processor_without_typed_kwargs(processor)
|
self.skip_processor_without_typed_kwargs(processor)
|
||||||
|
|
||||||
input_str = self.prepare_text_inputs()
|
input_str = self.prepare_text_inputs()
|
||||||
@@ -242,7 +246,8 @@ class ProcessorTesterMixin:
|
|||||||
if "image_processor" not in self.processor_class.attributes:
|
if "image_processor" not in self.processor_class.attributes:
|
||||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||||
processor_components = self.prepare_components()
|
processor_components = self.prepare_components()
|
||||||
processor = self.processor_class(**processor_components)
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
|
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||||
self.skip_processor_without_typed_kwargs(processor)
|
self.skip_processor_without_typed_kwargs(processor)
|
||||||
|
|
||||||
input_str = self.prepare_text_inputs()
|
input_str = self.prepare_text_inputs()
|
||||||
@@ -264,7 +269,8 @@ class ProcessorTesterMixin:
|
|||||||
if "image_processor" not in self.processor_class.attributes:
|
if "image_processor" not in self.processor_class.attributes:
|
||||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||||
processor_components = self.prepare_components()
|
processor_components = self.prepare_components()
|
||||||
processor = self.processor_class(**processor_components)
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
|
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||||
self.skip_processor_without_typed_kwargs(processor)
|
self.skip_processor_without_typed_kwargs(processor)
|
||||||
|
|
||||||
input_str = self.prepare_text_inputs(batch_size=2)
|
input_str = self.prepare_text_inputs(batch_size=2)
|
||||||
@@ -289,7 +295,8 @@ class ProcessorTesterMixin:
|
|||||||
if "image_processor" not in self.processor_class.attributes:
|
if "image_processor" not in self.processor_class.attributes:
|
||||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||||
processor_components = self.prepare_components()
|
processor_components = self.prepare_components()
|
||||||
processor = self.processor_class(**processor_components)
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
|
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||||
self.skip_processor_without_typed_kwargs(processor)
|
self.skip_processor_without_typed_kwargs(processor)
|
||||||
|
|
||||||
input_str = [self.prepare_text_inputs()]
|
input_str = [self.prepare_text_inputs()]
|
||||||
@@ -307,7 +314,8 @@ class ProcessorTesterMixin:
|
|||||||
if "image_processor" not in self.processor_class.attributes:
|
if "image_processor" not in self.processor_class.attributes:
|
||||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||||
processor_components = self.prepare_components()
|
processor_components = self.prepare_components()
|
||||||
processor = self.processor_class(**processor_components)
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
|
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||||
self.skip_processor_without_typed_kwargs(processor)
|
self.skip_processor_without_typed_kwargs(processor)
|
||||||
|
|
||||||
input_str = self.prepare_text_inputs()
|
input_str = self.prepare_text_inputs()
|
||||||
@@ -330,7 +338,8 @@ class ProcessorTesterMixin:
|
|||||||
if "image_processor" not in self.processor_class.attributes:
|
if "image_processor" not in self.processor_class.attributes:
|
||||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||||
processor_components = self.prepare_components()
|
processor_components = self.prepare_components()
|
||||||
processor = self.processor_class(**processor_components)
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
|
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||||
self.skip_processor_without_typed_kwargs(processor)
|
self.skip_processor_without_typed_kwargs(processor)
|
||||||
input_str = self.prepare_text_inputs()
|
input_str = self.prepare_text_inputs()
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|||||||
@@ -299,6 +299,9 @@ def check_attribute_being_used(config_class, attributes, default_value, source_s
|
|||||||
"unk_index",
|
"unk_index",
|
||||||
"mask_index",
|
"mask_index",
|
||||||
"image_token_index", # for VLMs
|
"image_token_index", # for VLMs
|
||||||
|
"video_token_index",
|
||||||
|
"image_seq_length",
|
||||||
|
"video_seq_length",
|
||||||
"image_size",
|
"image_size",
|
||||||
"use_cache",
|
"use_cache",
|
||||||
"out_features",
|
"out_features",
|
||||||
|
|||||||
Reference in New Issue
Block a user