BLIP: enable generation tests (#34174)
* blip2 tests
* instructblips
* copies
* fix slow tests
* fix
* uncomment this
* clean up after rebase
* should be model main input
* fix overwritten tests
* oops len should be multiple of frame number
* style
* fix some tests
This commit is contained in:
committed by
GitHub
parent
6beb3f1691
commit
4cc0813e28
@@ -20,7 +20,9 @@ import tempfile
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import requests
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import CONFIG_MAPPING, Blip2Config, Blip2QFormerConfig, Blip2VisionConfig
|
||||
from transformers.testing_utils import (
|
||||
@@ -392,7 +394,14 @@ class Blip2TextModelDecoderOnlyTester:
|
||||
# this model tester uses a decoder-only language model (OPT)
|
||||
class Blip2ForConditionalGenerationDecoderOnlyModelTester:
|
||||
def __init__(
|
||||
self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10
|
||||
self,
|
||||
parent,
|
||||
vision_kwargs=None,
|
||||
qformer_kwargs=None,
|
||||
text_kwargs=None,
|
||||
is_training=True,
|
||||
num_query_tokens=10,
|
||||
image_token_index=4,
|
||||
):
|
||||
if vision_kwargs is None:
|
||||
vision_kwargs = {}
|
||||
@@ -406,14 +415,24 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester:
|
||||
self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs)
|
||||
self.text_model_tester = Blip2TextModelDecoderOnlyTester(parent, **text_kwargs)
|
||||
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||
self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests
|
||||
self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests
|
||||
self.is_training = is_training
|
||||
self.num_query_tokens = num_query_tokens
|
||||
self.image_token_index = image_token_index
|
||||
|
||||
def prepare_config_and_inputs(self):
    """Assemble the composite config and the model inputs used by the common tests.

    The text prompt produced by the text tester is prefixed with `num_query_tokens`
    copies of `image_token_index`, and the attention mask is extended with ones so
    that it covers those placeholder positions.

    Returns:
        A tuple `(config, input_ids, attention_mask, pixel_values)`.
    """
    _vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
    _text_config, text_ids, text_mask = self.text_model_tester.prepare_config_and_inputs()

    # Any text id that collides with the image placeholder id is replaced by the pad id,
    # so the placeholder only appears in the prefix built below.
    text_ids.masked_fill_(text_ids == self.image_token_index, self.text_model_tester.pad_token_id)

    placeholder_ids = torch.full(
        (text_ids.shape[0], self.num_query_tokens),
        self.image_token_index,
        dtype=text_ids.dtype,
        device=torch_device,
    )
    input_ids = torch.cat([placeholder_ids, text_ids], dim=-1)
    attention_mask = torch.cat([torch.ones_like(placeholder_ids), text_mask], dim=-1)

    return self.get_config(), input_ids, attention_mask, pixel_values
|
||||
@@ -424,6 +443,7 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester:
|
||||
qformer_config=self.qformer_model_tester.get_config(),
|
||||
text_config=self.text_model_tester.get_config(),
|
||||
num_query_tokens=self.num_query_tokens,
|
||||
image_token_index=self.image_token_index,
|
||||
)
|
||||
|
||||
def create_and_check_for_conditional_generation(self, config, input_ids, attention_mask, pixel_values):
|
||||
@@ -451,6 +471,7 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester:
|
||||
@require_torch
|
||||
class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
test_pruning = False
|
||||
@@ -693,6 +714,192 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
|
||||
model = Blip2ForConditionalGeneration.from_pretrained(model_name)
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
# overwrite because BLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format
|
||||
def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1):
    """Check the structure of a `generate()` output for BLIP-2.

    This overrides the common check because BLIP internally calls the language model's
    `generate()` with embeddings, so it cannot operate without a cache. The output is
    therefore always validated in its cached layout.

    Args:
        output: generation output; `sequences`, `scores`, `logits`, the attention and
            hidden-state fields and `past_key_values` are read from it.
        config: model config; its `text_config` is used when it has one.
        use_cache: accepted for signature compatibility only; always treated as `True`.
        num_return_sequences: number of sequences returned per input row.
        num_beams: number of beams used during generation.
    """
    use_cache = True  # force this to be True in case False is passed

    # Integer division keeps the batch size an int without a float round-trip.
    input_batch_size = output.sequences.shape[0] // num_return_sequences
    internal_batch_size = input_batch_size * (num_beams if num_beams > 1 else num_return_sequences)

    # Later attributes take precedence over earlier ones when the tester defines several.
    seq_length = getattr(self.model_tester, "seq_length", None)
    seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length)
    seq_length = getattr(self.model_tester, "text_seq_length", seq_length)

    config = config.text_config if hasattr(config, "text_config") else config

    total_length = output.sequences.shape[-1]
    gen_len = total_length - 1 if config.is_encoder_decoder else total_length - seq_length

    # in some models we subsample the sequence length in inner layers
    if hasattr(self.model_tester, "get_subsampled_output_lengths"):
        seq_length = self.model_tester.get_subsampled_output_lengths(seq_length)

    # scores
    self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config)

    # unprocessed logits
    self._check_logits(internal_batch_size, output.logits, config=config)

    # Attentions
    if self.has_attentions:
        if config.is_encoder_decoder:
            # encoder
            self._check_encoder_attention_for_generate(
                output.encoder_attentions, input_batch_size, config, seq_length
            )
            # decoder
            self._check_attentions_for_generate(
                internal_batch_size,
                output.decoder_attentions,
                min_length=1,
                max_length=total_length,
                config=config,
                use_cache=use_cache,
            )
        else:
            # with a cache the first step is equal to the no-cache case, so it is skipped
            self._check_attentions_for_generate(
                internal_batch_size,
                attentions=output.attentions[1:],
                min_length=seq_length + 1,
                max_length=total_length,
                config=config,
                use_cache=use_cache,
            )

    # Hidden States
    if config.is_encoder_decoder:
        # encoder
        self._check_encoder_hidden_states_for_generate(
            output.encoder_hidden_states, input_batch_size, config, seq_length
        )
        # decoder
        self._check_hidden_states_for_generate(
            internal_batch_size,
            output.decoder_hidden_states,
            min_length=1,
            max_length=total_length,
            config=config,
            use_cache=use_cache,
        )
    else:
        # with a cache the first step is equal to the no-cache case, so it is skipped
        self._check_hidden_states_for_generate(
            internal_batch_size,
            output.hidden_states[1:],
            min_length=seq_length + 1,
            max_length=total_length,
            config=config,
            use_cache=use_cache,
        )

    # Past Key Value States (always present, since the cache is always on)
    self._check_past_key_values_for_generate(
        internal_batch_size,
        output.past_key_values,
        seq_length=total_length - 1,
        config=config,
    )
|
||||
|
||||
# overwrite because BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present
|
||||
@pytest.mark.generate
|
||||
def test_left_padding_compatibility(self):
    """Left-padding the prompt must leave the next-token logits (almost) unchanged.

    Overridden because BLIP-2 needs `pixel_values` on every forward call.
    Small numerical differences from left-padding are expected, see
    https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535
    """
    # --- Filter out setups that are not expected to support left padding ---
    # The model must have generative capabilities
    if not self.all_generative_model_classes:
        self.skipTest(reason="No generative architecture available for this model.")

    # The model must support padding
    if not self.has_attentions:
        self.skipTest(reason="This model doesn't support padding.")

    # The model must be decoder-only (encoder-based architectures use right-padding)
    decoder_only_classes = []
    for candidate in self.all_generative_model_classes:
        config, _ = self.prepare_config_and_inputs_for_generate()
        if not config.is_encoder_decoder:
            decoder_only_classes.append(candidate)
    if not decoder_only_classes:
        self.skipTest(reason="No decoder-only architecture available for this model.")

    # Decoder-only architectures derived from encoder-decoder models could support it in theory,
    # but support has not been added yet, so they are skipped for now.
    encoder_keys = [
        key
        for key in config.to_dict()
        if key.startswith("encoder") and key != "encoder_no_repeat_ngram_size"
    ]
    if encoder_keys:
        self.skipTest(
            reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding."
        )

    # --- Actual left-padding check ---
    def _build_forward_kwargs(ids, mask, forward_params):
        # Only pass the optional positional arguments that the forward signature accepts.
        kwargs = {"input_ids": ids, "attention_mask": mask}
        if "position_ids" in forward_params:
            positions = torch.cumsum(mask, dim=-1) - 1
            positions.masked_fill_(mask == 0, 1)
            kwargs["position_ids"] = positions
        if "cache_position" in forward_params:
            kwargs["cache_position"] = torch.arange(ids.shape[-1], device=torch_device)
        return kwargs

    for model_class in decoder_only_classes:
        config, inputs_dict = self.prepare_config_and_inputs_for_generate()
        input_ids = inputs_dict["input_ids"]
        attention_mask = inputs_dict.get("attention_mask")
        pixel_values = inputs_dict["pixel_values"]
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        model = model_class(config).to(torch_device).eval()
        forward_params = inspect.signature(model.forward).parameters.keys()

        # no cache as some models require special cache classes to be init outside forward
        model.generation_config.use_cache = False

        # Reference: no padding
        plain_kwargs = _build_forward_kwargs(input_ids, attention_mask, forward_params)
        next_logits_wo_padding = model(**plain_kwargs, pixel_values=pixel_values).logits[:, -1, :]

        # Same prompt with 32 pad positions prepended; the pad id falls back to 0 because
        # those positions are masked out by the attention mask anyway.
        text_config = config.get_text_config()
        pad_token_id = text_config.pad_token_id if text_config.pad_token_id is not None else 0
        pad_block = torch.full(
            (input_ids.shape[0], 32), pad_token_id, dtype=input_ids.dtype, device=torch_device
        )
        padded_ids = torch.cat((pad_block, input_ids), dim=1)
        padded_mask = torch.cat((torch.zeros_like(pad_block), attention_mask), dim=1)
        padded_kwargs = _build_forward_kwargs(padded_ids, padded_mask, forward_params)
        next_logits_with_padding = model(**padded_kwargs, pixel_values=pixel_values).logits[:, -1, :]

        # They should result in very similar logits
        self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5))
|
||||
|
||||
@unittest.skip("BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present")
|
||||
@parameterized.expand([("greedy", 1), ("beam search", 2)])
|
||||
def test_generate_from_inputs_embeds(self, _, num_beams):
    # Deliberately empty: this test is disabled through the `unittest.skip` decorator,
    # because BLIP2 requires pixel values and cannot generate from input ids alone.
    pass
|
||||
|
||||
|
||||
# this class is based on `T5ModelTester` found in tests/models/t5/test_modeling_t5.py
|
||||
class Blip2TextModelTester:
|
||||
@@ -1780,6 +1987,7 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
|
||||
generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
|
||||
|
||||
# Test output
|
||||
print(predictions[0].tolist(), generated_text)
|
||||
self.assertEqual(predictions[0].tolist(), [2, 102, 693, 2828, 15, 5, 4105, 19, 10, 2335, 50118])
|
||||
self.assertEqual("a woman sitting on the beach with a dog", generated_text)
|
||||
|
||||
@@ -1794,9 +2002,9 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
|
||||
# Test output
|
||||
self.assertEqual(
|
||||
predictions[0].tolist(),
|
||||
[2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118],
|
||||
[2, 45641, 35, 61, 343, 16, 42, 116, 31652, 35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118],
|
||||
)
|
||||
self.assertEqual(generated_text, "it's not a city, it's a beach")
|
||||
self.assertEqual(generated_text, "Question: which city is this? Answer: it's not a city, it's a beach")
|
||||
|
||||
def test_inference_interpolate_pos_encoding(self):
|
||||
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
|
||||
@@ -1905,9 +2113,9 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
|
||||
# Test output
|
||||
self.assertEqual(
|
||||
predictions[0].tolist(),
|
||||
[2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118],
|
||||
[2, 45641, 35, 61, 343, 16, 42, 116, 31652, 35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118],
|
||||
)
|
||||
self.assertEqual(generated_text, "it's not a city, it's a beach")
|
||||
self.assertEqual(generated_text, "Question: which city is this? Answer: it's not a city, it's a beach")
|
||||
|
||||
@require_torch_multi_accelerator
|
||||
def test_inference_t5_multi_accelerator(self):
|
||||
|
||||
@@ -19,7 +19,9 @@ import tempfile
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import requests
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
@@ -320,7 +322,7 @@ class InstructBlipTextModelDecoderOnlyTester:
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=20,
|
||||
max_position_embeddings=100,
|
||||
eos_token_id=2,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
@@ -384,7 +386,14 @@ class InstructBlipTextModelDecoderOnlyTester:
|
||||
# this model tester uses a decoder-only language model (OPT)
|
||||
class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
|
||||
def __init__(
|
||||
self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10
|
||||
self,
|
||||
parent,
|
||||
vision_kwargs=None,
|
||||
qformer_kwargs=None,
|
||||
text_kwargs=None,
|
||||
is_training=True,
|
||||
num_query_tokens=10,
|
||||
image_token_index=4,
|
||||
):
|
||||
if vision_kwargs is None:
|
||||
vision_kwargs = {}
|
||||
@@ -398,9 +407,10 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
|
||||
self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs)
|
||||
self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs)
|
||||
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||
self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests
|
||||
self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests
|
||||
self.is_training = is_training
|
||||
self.num_query_tokens = num_query_tokens
|
||||
self.image_token_index = image_token_index
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
_, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
|
||||
@@ -408,6 +418,14 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
|
||||
_, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
|
||||
|
||||
config = self.get_config()
|
||||
vision_tokens = (
|
||||
torch.ones((input_ids.shape[0], self.num_query_tokens), device=torch_device, dtype=input_ids.dtype)
|
||||
* self.image_token_index
|
||||
)
|
||||
input_ids[input_ids == self.image_token_index] = self.text_model_tester.pad_token_id
|
||||
input_ids = torch.cat([vision_tokens, input_ids], dim=-1)
|
||||
vision_attention_mask = torch.ones_like(vision_tokens)
|
||||
attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1)
|
||||
|
||||
return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values
|
||||
|
||||
@@ -417,6 +435,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
|
||||
qformer_config=self.qformer_model_tester.get_config(),
|
||||
text_config=self.text_model_tester.get_config(),
|
||||
num_query_tokens=self.num_query_tokens,
|
||||
image_token_index=self.image_token_index,
|
||||
)
|
||||
|
||||
def create_and_check_for_conditional_generation(
|
||||
@@ -455,6 +474,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
|
||||
@require_torch
|
||||
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"image-text-to-text": InstructBlipForConditionalGeneration}
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
@@ -532,6 +552,199 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
|
||||
model = InstructBlipForConditionalGeneration.from_pretrained(model_name)
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
# overwrite because InstructBLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format
|
||||
def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1):
    """Check the structure of a `generate()` output for InstructBLIP.

    This overrides the common check because InstructBLIP internally calls the language
    model's `generate()` with embeddings, so it cannot operate without a cache. The
    output is therefore always validated in its cached layout.

    Args:
        output: generation output; `sequences`, `scores`, `logits`, the attention and
            hidden-state fields and `past_key_values` are read from it.
        config: model config; its `text_config` is used when it has one.
        use_cache: accepted for signature compatibility only; always treated as `True`.
        num_return_sequences: number of sequences returned per input row.
        num_beams: number of beams used during generation.
    """
    use_cache = True  # force this to be True in case False is passed

    # Integer division keeps the batch size an int without a float round-trip.
    input_batch_size = output.sequences.shape[0] // num_return_sequences
    internal_batch_size = input_batch_size * (num_beams if num_beams > 1 else num_return_sequences)

    # Later attributes take precedence over earlier ones when the tester defines several.
    seq_length = getattr(self.model_tester, "seq_length", None)
    seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length)
    seq_length = getattr(self.model_tester, "text_seq_length", seq_length)

    config = config.text_config if hasattr(config, "text_config") else config

    total_length = output.sequences.shape[-1]
    gen_len = total_length - 1 if config.is_encoder_decoder else total_length - seq_length

    # in some models we subsample the sequence length in inner layers
    if hasattr(self.model_tester, "get_subsampled_output_lengths"):
        seq_length = self.model_tester.get_subsampled_output_lengths(seq_length)

    # scores
    self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config)

    # unprocessed logits
    self._check_logits(internal_batch_size, output.logits, config=config)

    # Attentions
    if self.has_attentions:
        if config.is_encoder_decoder:
            # encoder
            self._check_encoder_attention_for_generate(
                output.encoder_attentions, input_batch_size, config, seq_length
            )
            # decoder
            self._check_attentions_for_generate(
                internal_batch_size,
                output.decoder_attentions,
                min_length=1,
                max_length=total_length,
                config=config,
                use_cache=use_cache,
            )
        else:
            # with a cache the first step is equal to the no-cache case, so it is skipped
            self._check_attentions_for_generate(
                internal_batch_size,
                attentions=output.attentions[1:],
                min_length=seq_length + 1,
                max_length=total_length,
                config=config,
                use_cache=use_cache,
            )

    # Hidden States
    if config.is_encoder_decoder:
        # encoder
        self._check_encoder_hidden_states_for_generate(
            output.encoder_hidden_states, input_batch_size, config, seq_length
        )
        # decoder
        self._check_hidden_states_for_generate(
            internal_batch_size,
            output.decoder_hidden_states,
            min_length=1,
            max_length=total_length,
            config=config,
            use_cache=use_cache,
        )
    else:
        # with a cache the first step is equal to the no-cache case, so it is skipped
        self._check_hidden_states_for_generate(
            internal_batch_size,
            output.hidden_states[1:],
            min_length=seq_length + 1,
            max_length=total_length,
            config=config,
            use_cache=use_cache,
        )

    # Past Key Value States (always present, since the cache is always on)
    self._check_past_key_values_for_generate(
        internal_batch_size,
        output.past_key_values,
        seq_length=total_length - 1,
        config=config,
    )
|
||||
|
||||
# overwrite because InstructBLIP cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present
|
||||
@pytest.mark.generate
|
||||
def test_left_padding_compatibility(self):
    """Left-padding the prompt must leave the next-token logits (almost) unchanged.

    Overridden because InstructBLIP needs `pixel_values` and `qformer_input_ids`
    on every forward call. Small numerical differences from left-padding are expected, see
    https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535
    """
    # --- Filter out setups that are not expected to support left padding ---
    # The model must have generative capabilities
    if not self.all_generative_model_classes:
        self.skipTest(reason="No generative architecture available for this model.")

    # The model must support padding
    if not self.has_attentions:
        self.skipTest(reason="This model doesn't support padding.")

    # The model must be decoder-only (encoder-based architectures use right-padding)
    decoder_only_classes = []
    for candidate in self.all_generative_model_classes:
        config, _ = self.prepare_config_and_inputs_for_generate()
        if not config.is_encoder_decoder:
            decoder_only_classes.append(candidate)
    if not decoder_only_classes:
        self.skipTest(reason="No decoder-only architecture available for this model.")

    # Decoder-only architectures derived from encoder-decoder models could support it in theory,
    # but support has not been added yet, so they are skipped for now.
    encoder_keys = [
        key
        for key in config.to_dict()
        if key.startswith("encoder") and key != "encoder_no_repeat_ngram_size"
    ]
    if encoder_keys:
        self.skipTest(
            reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding."
        )

    # --- Actual left-padding check ---
    def _build_forward_kwargs(ids, mask, forward_params):
        # Only pass the optional positional arguments that the forward signature accepts.
        kwargs = {"input_ids": ids, "attention_mask": mask}
        if "position_ids" in forward_params:
            positions = torch.cumsum(mask, dim=-1) - 1
            positions.masked_fill_(mask == 0, 1)
            kwargs["position_ids"] = positions
        if "cache_position" in forward_params:
            kwargs["cache_position"] = torch.arange(ids.shape[-1], device=torch_device)
        return kwargs

    for model_class in decoder_only_classes:
        config, inputs_dict = self.prepare_config_and_inputs_for_generate()
        input_ids = inputs_dict["input_ids"]
        attention_mask = inputs_dict.get("attention_mask")
        pixel_values = inputs_dict["pixel_values"]
        qformer_input_ids = inputs_dict["qformer_input_ids"]
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        model = model_class(config).to(torch_device).eval()
        forward_params = inspect.signature(model.forward).parameters.keys()

        # no cache as some models require special cache classes to be init outside forward
        model.generation_config.use_cache = False

        # Reference: no padding
        plain_kwargs = _build_forward_kwargs(input_ids, attention_mask, forward_params)
        next_logits_wo_padding = model(
            **plain_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids
        ).logits[:, -1, :]

        # Same prompt with 32 pad positions prepended; the pad id falls back to 0 because
        # those positions are masked out by the attention mask anyway.
        text_config = config.get_text_config()
        pad_token_id = text_config.pad_token_id if text_config.pad_token_id is not None else 0
        pad_block = torch.full(
            (input_ids.shape[0], 32), pad_token_id, dtype=input_ids.dtype, device=torch_device
        )
        padded_ids = torch.cat((pad_block, input_ids), dim=1)
        padded_mask = torch.cat((torch.zeros_like(pad_block), attention_mask), dim=1)
        padded_kwargs = _build_forward_kwargs(padded_ids, padded_mask, forward_params)
        next_logits_with_padding = model(
            **padded_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids
        ).logits[:, -1, :]

        # They should result in very similar logits
        self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5))
|
||||
|
||||
@unittest.skip(
|
||||
"InstructBLIP cannot generate only from input ids, and requires pixel values in all cases to be present"
|
||||
)
|
||||
@parameterized.expand([("greedy", 1), ("beam search", 2)])
|
||||
def test_generate_from_inputs_embeds(self, _, num_beams):
    # Deliberately empty: this test is disabled through the `unittest.skip` decorator,
    # because InstructBLIP requires pixel values and cannot generate from input ids alone.
    pass
|
||||
|
||||
@require_torch_sdpa
|
||||
def test_sdpa_can_dispatch_composite_models(self):
|
||||
"""
|
||||
@@ -632,12 +845,12 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
|
||||
outputs = model.generate(**inputs, max_new_tokens=30)
|
||||
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
|
||||
|
||||
expected_outputs = [2, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 372, 338, 19500, 1623, 263, 19587, 4272] # fmt: off
|
||||
expected_outputs = [2, 1724, 338, 22910, 1048, 445, 1967, 29973, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 19500, 373, 263, 19587, 4272, 11952, 29889] # fmt: off
|
||||
|
||||
self.assertEqual(outputs[0].tolist(), expected_outputs)
|
||||
self.assertEqual(
|
||||
generated_text,
|
||||
"The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while it is driving down a busy city",
|
||||
"What is unusual about this image? The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while driving on a busy city street.",
|
||||
)
|
||||
|
||||
def test_inference_flant5_xl(self):
|
||||
|
||||
@@ -19,7 +19,9 @@ import tempfile
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
@@ -398,7 +400,14 @@ class InstructBlipVideoTextModelDecoderOnlyTester:
|
||||
# this model tester uses a decoder-only language model (OPT)
|
||||
class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
|
||||
def __init__(
|
||||
self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10
|
||||
self,
|
||||
parent,
|
||||
vision_kwargs=None,
|
||||
qformer_kwargs=None,
|
||||
text_kwargs=None,
|
||||
is_training=True,
|
||||
num_query_tokens=10,
|
||||
video_token_index=4,
|
||||
):
|
||||
if vision_kwargs is None:
|
||||
vision_kwargs = {}
|
||||
@@ -412,17 +421,30 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
|
||||
self.qformer_model_tester = InstructBlipVideoQFormerModelTester(parent, **qformer_kwargs)
|
||||
self.text_model_tester = InstructBlipVideoTextModelDecoderOnlyTester(parent, **text_kwargs)
|
||||
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
|
||||
self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests
|
||||
self.frames = self.vision_model_tester.frames
|
||||
# need seq_length for common tests
|
||||
self.seq_length = self.text_model_tester.seq_length + (num_query_tokens * self.frames)
|
||||
self.is_training = is_training
|
||||
self.num_query_tokens = num_query_tokens
|
||||
self.video_token_index = video_token_index
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
_, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
|
||||
_, _, _, qformer_input_ids, qformer_attention_mask = self.qformer_model_tester.prepare_config_and_inputs()
|
||||
_, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
|
||||
frames = self.vision_model_tester.frames
|
||||
_, c, h, w = pixel_values.shape
|
||||
pixel_values = pixel_values.reshape(-1, frames, c, h, w)
|
||||
pixel_values = pixel_values.reshape(-1, self.frames, c, h, w)
|
||||
|
||||
vision_tokens = (
|
||||
torch.ones(
|
||||
(input_ids.shape[0], self.num_query_tokens * self.frames), device=torch_device, dtype=input_ids.dtype
|
||||
)
|
||||
* self.video_token_index
|
||||
)
|
||||
input_ids[input_ids == self.video_token_index] = self.text_model_tester.pad_token_id
|
||||
input_ids = torch.cat([vision_tokens, input_ids], dim=-1)
|
||||
vision_attention_mask = torch.ones_like(vision_tokens)
|
||||
attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1)
|
||||
|
||||
config = self.get_config()
|
||||
|
||||
@@ -434,6 +456,7 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
|
||||
qformer_config=self.qformer_model_tester.get_config(),
|
||||
text_config=self.text_model_tester.get_config(),
|
||||
num_query_tokens=self.num_query_tokens,
|
||||
video_token_index=self.video_token_index,
|
||||
)
|
||||
|
||||
def create_and_check_for_conditional_generation(
|
||||
@@ -476,6 +499,7 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
|
||||
ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
|
||||
):
|
||||
all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
test_pruning = False
|
||||
@@ -552,6 +576,199 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
|
||||
model = InstructBlipVideoForConditionalGeneration.from_pretrained(model_name)
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
# overwrite because InstructBLIPVideo internally calls LM.generate() with embeds thus it cannot operate in no cache format
def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1):
    """
    Check the shapes of every field returned by a `generate()` call.

    Args:
        output: generation output; `sequences`, `scores`, `logits`, the attention / hidden-state
            fields and `past_key_values` are read from it.
        config: model config. When it exposes a `text_config`, that sub-config is used instead.
        use_cache: accepted for signature compatibility only, the value is overridden with `True`.
        num_return_sequences: number of sequences returned per input row.
        num_beams: number of beams used during generation.
    """
    use_cache = True  # force this to be True in case False is passed

    total_length = output.sequences.shape[-1]
    input_batch_size = int(output.sequences.shape[0] / num_return_sequences)
    if num_beams > 1:
        internal_batch_size = input_batch_size * num_beams
    else:
        internal_batch_size = input_batch_size * num_return_sequences

    # later attribute names take precedence over earlier ones when the tester defines several of them
    seq_length = None
    for length_attr in ("seq_length", "encoder_seq_length", "text_seq_length"):
        seq_length = getattr(self.model_tester, length_attr, seq_length)

    if hasattr(config, "text_config"):
        config = config.text_config

    if config.is_encoder_decoder:
        gen_len = total_length - 1
    else:
        gen_len = total_length - seq_length

    # in some models we subsample the sequence length in inner layers
    if hasattr(self.model_tester, "get_subsampled_output_lengths"):
        seq_length = self.model_tester.get_subsampled_output_lengths(seq_length)

    # scores
    self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config)

    # unprocessed logits
    self._check_logits(internal_batch_size, output.logits, config=config)

    # Attentions
    if self.has_attentions:
        if config.is_encoder_decoder:
            # encoder
            self._check_encoder_attention_for_generate(
                output.encoder_attentions, input_batch_size, config, seq_length
            )
            # decoder
            self._check_attentions_for_generate(
                internal_batch_size,
                output.decoder_attentions,
                min_length=1,
                max_length=total_length,
                config=config,
                use_cache=use_cache,
            )
        else:
            # if use_cache first input is equal to no use_cache, so skip here
            if use_cache:
                attentions = output.attentions[1:]
                min_length = seq_length + 1
            else:
                attentions = output.attentions
                min_length = seq_length
            self._check_attentions_for_generate(
                internal_batch_size,
                attentions=attentions,
                min_length=min_length,
                max_length=total_length,
                config=config,
                use_cache=use_cache,
            )

    # Hidden States
    if config.is_encoder_decoder:
        # encoder
        self._check_encoder_hidden_states_for_generate(
            output.encoder_hidden_states, input_batch_size, config, seq_length
        )

        # decoder
        self._check_hidden_states_for_generate(
            internal_batch_size,
            output.decoder_hidden_states,
            min_length=1,
            max_length=total_length,
            config=config,
            use_cache=use_cache,
        )
    else:
        # if use_cache first input is equal to no use_cache, so skip here
        if use_cache:
            hidden_states = output.hidden_states[1:]
            min_length = seq_length + 1
        else:
            hidden_states = output.hidden_states
            min_length = seq_length
        self._check_hidden_states_for_generate(
            internal_batch_size,
            hidden_states,
            min_length=min_length,
            max_length=total_length,
            config=config,
            use_cache=use_cache,
        )

    # Past Key Value States
    if use_cache:
        self._check_past_key_values_for_generate(
            internal_batch_size,
            output.past_key_values,
            seq_length=total_length - 1,
            config=config,
        )
|
||||
|
||||
# overwrite because InstructBLIPVideo cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present
@pytest.mark.generate
def test_left_padding_compatibility(self):
    """
    Check that left-padding the text inputs leaves the next-token logits (almost) unchanged.

    The model is run once on the plain inputs and once with 32 masked pad tokens prepended;
    `pixel_values` and `qformer_input_ids` are passed unchanged to both forward calls.
    """
    # NOTE: left-padding results in small numerical differences. This is expected.
    # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535

    # First, filter out models that don't support left padding
    # - The model must have generative capabilities
    if len(self.all_generative_model_classes) == 0:
        self.skipTest(reason="No generative architecture available for this model.")

    # - The model must support padding
    if not self.has_attentions:
        self.skipTest(reason="This model doesn't support padding.")

    # - The model must be a decoder-only architecture (encoder-based architectures use right-padding)
    decoder_only_classes = []
    for model_class in self.all_generative_model_classes:
        config, _ = self.prepare_config_and_inputs_for_generate()
        if not config.is_encoder_decoder:
            decoder_only_classes.append(model_class)
    if len(decoder_only_classes) == 0:
        self.skipTest(reason="No decoder-only architecture available for this model.")

    # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't
    #   added support for it yet. We skip these models for now.
    # (`config` is the one prepared in the last iteration of the loop above)
    has_encoder_attributes = any(
        key.startswith("encoder") and key != "encoder_no_repeat_ngram_size" for key in config.to_dict()
    )
    if has_encoder_attributes:
        self.skipTest(
            reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding."
        )

    # Then, test left-padding
    def _prepare_model_kwargs(input_ids, attention_mask, signature):
        # only add the optional positional inputs that the model's forward actually accepts
        kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if "position_ids" in signature:
            position_ids = torch.cumsum(attention_mask, dim=-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            kwargs["position_ids"] = position_ids
        if "cache_position" in signature:
            kwargs["cache_position"] = torch.arange(input_ids.shape[-1], device=torch_device)
        return kwargs

    for model_class in decoder_only_classes:
        config, inputs_dict = self.prepare_config_and_inputs_for_generate()
        input_ids = inputs_dict["input_ids"]
        attention_mask = inputs_dict.get("attention_mask")
        # inputs that must accompany every forward call of this model
        extra_inputs = {
            "pixel_values": inputs_dict["pixel_values"],
            "qformer_input_ids": inputs_dict["qformer_input_ids"],
        }
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        model = model_class(config).to(torch_device).eval()
        forward_params = inspect.signature(model.forward).parameters.keys()

        # no cache as some models require special cache classes to be init outside forward
        model.generation_config.use_cache = False

        # Without padding
        plain_kwargs = _prepare_model_kwargs(input_ids, attention_mask, forward_params)
        next_logits_wo_padding = model(**plain_kwargs, **extra_inputs).logits[:, -1, :]

        # With left-padding (length 32)
        # can hardcode pad_token to be 0 as we'll do attn masking anyway
        pad_token_id = config.get_text_config().pad_token_id
        if pad_token_id is None:
            pad_token_id = 0
        pad_size = (input_ids.shape[0], 32)
        padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id
        padded_input_ids = torch.cat((padding, input_ids), dim=1)
        padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1)
        padded_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, forward_params)
        next_logits_with_padding = model(**padded_kwargs, **extra_inputs).logits[:, -1, :]

        # They should result in very similar logits
        self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5))
|
||||
|
||||
# `parameterized.expand` must be the outermost decorator: it generates one test method per parameter set
# from the function it receives. Applying `unittest.skip` first means every generated case wraps the
# skipped function and is reported as skipped, instead of running the empty body and being reported as passed.
@parameterized.expand([("greedy", 1), ("beam search", 2)])
@unittest.skip(
    "InstructBLIPVideo cannot generate only from input ids, and requires pixel values in all cases to be present"
)
def test_generate_from_inputs_embeds(self, _, num_beams):
    """Intentionally disabled for this model; see the skip reason above."""
    pass
|
||||
|
||||
@require_torch_sdpa
|
||||
def test_sdpa_can_dispatch_composite_models(self):
|
||||
"""
|
||||
@@ -643,7 +860,7 @@ class InstructBlipVideoModelIntegrationTest(unittest.TestCase):
|
||||
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
|
||||
self.assertEqual(
|
||||
generated_text,
|
||||
"a baby girl wearing glasses is reading a book on the bed 1080p",
|
||||
"Explain what is happening in this short video. a baby girl wearing glasses is reading a book on the bed 1080p",
|
||||
)
|
||||
|
||||
def test_expansion_in_processing(self):
|
||||
|
||||
Reference in New Issue
Block a user