From 4cc0813e28c3ea1f0e1257d6079658c5f71d3dc6 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Fri, 1 Nov 2024 08:54:48 +0100 Subject: [PATCH] BLIP: enable generation tests (#34174) * blip2 tests * instructblips * copies * fix slow tests * fix * uncomment this * clean up after rebase * should be model main input * fix overwritten tests * oops len should be multiple of frame number * style * fix some tests --- .../models/blip_2/modeling_blip_2.py | 21 +- .../instructblip/modeling_instructblip.py | 25 +- .../modeling_instructblipvideo.py | 25 +- .../modular_instructblipvideo.py | 25 +- tests/generation/test_utils.py | 1 + tests/models/blip_2/test_modeling_blip_2.py | 220 ++++++++++++++++- .../test_modeling_instructblip.py | 223 ++++++++++++++++- .../test_modeling_instructblipvideo.py | 227 +++++++++++++++++- 8 files changed, 671 insertions(+), 96 deletions(-) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 4c06d85b50..08e42d1c8f 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -2342,24 +2342,11 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin): ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - bos_tokens = ( - torch.LongTensor([[self.config.text_config.bos_token_id]]) - .repeat(batch_size, 1) - .to(image_embeds.device) - ) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = 
torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index 5cce774ce0..a78a3b6687 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -1625,27 +1625,10 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the tokenizer's bos token being set to which has ID=2, - # whereas the model's text config has bos token id = 0 - bos_token_id = ( - 2 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM" - else self.config.text_config.bos_token_id - ) - bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git 
a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index c9f1239166..90fc211397 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -1660,27 +1660,10 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the tokenizer's bos token being set to which has ID=2, - # whereas the model's text config has bos token id = 0 - bos_token_id = ( - 2 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM" - else self.config.text_config.bos_token_id - ) - bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 2128f25df6..63c6c48685 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py 
+++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -468,27 +468,10 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGenera ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the tokenizer's bos token being set to which has ID=2, - # whereas the model's text config has bos token id = 0 - bos_token_id = ( - 2 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM" - else self.config.text_config.bos_token_id - ) - bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index cf10ff1b92..3bd8ce4b59 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -96,6 +96,7 @@ if is_torch_available(): class GenerationTesterMixin: + input_name = "input_ids" model_tester = None all_generative_model_classes = () max_new_tokens = 3 diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index d91adf1bd4..1ec9c2e1c0 100644 --- 
a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -20,7 +20,9 @@ import tempfile import unittest import numpy as np +import pytest import requests +from parameterized import parameterized from transformers import CONFIG_MAPPING, Blip2Config, Blip2QFormerConfig, Blip2VisionConfig from transformers.testing_utils import ( @@ -392,7 +394,14 @@ class Blip2TextModelDecoderOnlyTester: # this model tester uses a decoder-only language model (OPT) class Blip2ForConditionalGenerationDecoderOnlyModelTester: def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10, + image_token_index=4, ): if vision_kwargs is None: vision_kwargs = {} @@ -406,14 +415,24 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester: self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = Blip2TextModelDecoderOnlyTester(parent, **text_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests + self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens + self.image_token_index = image_token_index def prepare_config_and_inputs(self): _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_tokens = ( + torch.ones((input_ids.shape[0], self.num_query_tokens), device=torch_device, dtype=input_ids.dtype) + * self.image_token_index + ) + input_ids[input_ids == self.image_token_index] = self.text_model_tester.pad_token_id + input_ids = torch.cat([vision_tokens, 
input_ids], dim=-1) + vision_attention_mask = torch.ones_like(vision_tokens) + attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1) + config = self.get_config() return config, input_ids, attention_mask, pixel_values @@ -424,6 +443,7 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester: qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), num_query_tokens=self.num_query_tokens, + image_token_index=self.image_token_index, ) def create_and_check_for_conditional_generation(self, config, input_ids, attention_mask, pixel_values): @@ -451,6 +471,7 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester: @require_torch class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else () fx_compatible = False test_head_masking = False test_pruning = False @@ -693,6 +714,192 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT model = Blip2ForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) + # overwrite because BLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): + use_cache = True # force this to be True in case False is passed + + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + ) + + seq_length = getattr(self.model_tester, "seq_length", None) + seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) + seq_length = getattr(self.model_tester, "text_seq_length", seq_length) + + config = config.text_config if 
hasattr(config, "text_config") else config + + gen_len = ( + output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length + ) + + # in some models we subsample the sequence length in inner layers + if hasattr(self.model_tester, "get_subsampled_output_lengths"): + seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) + + # scores + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) + + # unprocessed logits + self._check_logits(internal_batch_size, output.logits, config=config) + + # Attentions + if self.has_attentions: + if config.is_encoder_decoder: + # encoder + self._check_encoder_attention_for_generate( + output.encoder_attentions, input_batch_size, config, seq_length + ) + # decoder + self._check_attentions_for_generate( + internal_batch_size, + output.decoder_attentions, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + attentions = output.attentions if not use_cache else output.attentions[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_attentions_for_generate( + internal_batch_size, + attentions=attentions, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Hidden States + if config.is_encoder_decoder: + # encoder + self._check_encoder_hidden_states_for_generate( + output.encoder_hidden_states, input_batch_size, config, seq_length + ) + + # decoder + self._check_hidden_states_for_generate( + internal_batch_size, + output.decoder_hidden_states, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] + min_length = seq_length if not 
use_cache else seq_length + 1 + self._check_hidden_states_for_generate( + internal_batch_size, + hidden_states, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Past Key Value States + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + internal_batch_size, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) + + # overwrite because BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present + @pytest.mark.generate + def test_left_padding_compatibility(self): + # NOTE: left-padding results in small numerical differences. This is expected. + # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + # First, filter out models that don't support left padding + # - The model must have generative capabilities + if len(self.all_generative_model_classes) == 0: + self.skipTest(reason="No generative architecture available for this model.") + + # - The model must support padding + if not self.has_attentions: + self.skipTest(reason="This model doesn't support padding.") + + # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + decoder_only_classes = [] + for model_class in self.all_generative_model_classes: + config, _ = self.prepare_config_and_inputs_for_generate() + if config.is_encoder_decoder: + continue + else: + decoder_only_classes.append(model_class) + if len(decoder_only_classes) == 0: + self.skipTest(reason="No decoder-only architecture available for this model.") + + # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't + # added support for it yet. We skip these models for now. 
+ has_encoder_attributes = any( + attr_name + for attr_name in config.to_dict().keys() + if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" + ) + if has_encoder_attributes: + self.skipTest( + reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." + ) + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict.get("attention_mask") + pixel_values = inputs_dict["pixel_values"] + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model(**model_kwargs, pixel_values=pixel_values).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * 
pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) + next_logits_with_padding = model(**model_kwargs, pixel_values=pixel_values).logits[:, -1, :] + + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) + + @unittest.skip("BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present") + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): + pass + # this class is based on `T5ModelTester` found in tests/models/t5/test_modeling_t5.py class Blip2TextModelTester: @@ -1780,6 +1987,7 @@ class Blip2ModelIntegrationTest(unittest.TestCase): generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() # Test output + print(predictions[0].tolist(), generated_text) self.assertEqual(predictions[0].tolist(), [2, 102, 693, 2828, 15, 5, 4105, 19, 10, 2335, 50118]) self.assertEqual("a woman sitting on the beach with a dog", generated_text) @@ -1794,9 +2002,9 @@ class Blip2ModelIntegrationTest(unittest.TestCase): # Test output self.assertEqual( predictions[0].tolist(), - [2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], + [2, 45641, 35, 61, 343, 16, 42, 116, 31652, 35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], ) - self.assertEqual(generated_text, "it's not a city, it's a beach") + self.assertEqual(generated_text, "Question: which city is this? 
Answer: it's not a city, it's a beach") def test_inference_interpolate_pos_encoding(self): processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") @@ -1905,9 +2113,9 @@ class Blip2ModelIntegrationTest(unittest.TestCase): # Test output self.assertEqual( predictions[0].tolist(), - [2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], + [2, 45641, 35, 61, 343, 16, 42, 116, 31652, 35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], ) - self.assertEqual(generated_text, "it's not a city, it's a beach") + self.assertEqual(generated_text, "Question: which city is this? Answer: it's not a city, it's a beach") @require_torch_multi_accelerator def test_inference_t5_multi_accelerator(self): diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index a9dba06dab..f06caeb037 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -19,7 +19,9 @@ import tempfile import unittest import numpy as np +import pytest import requests +from parameterized import parameterized from transformers import ( CONFIG_MAPPING, @@ -320,7 +322,7 @@ class InstructBlipTextModelDecoderOnlyTester: hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, - max_position_embeddings=20, + max_position_embeddings=100, eos_token_id=2, pad_token_id=1, bos_token_id=0, @@ -384,7 +386,14 @@ class InstructBlipTextModelDecoderOnlyTester: # this model tester uses a decoder-only language model (OPT) class InstructBlipForConditionalGenerationDecoderOnlyModelTester: def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10, + image_token_index=4, ): if vision_kwargs is None: vision_kwargs = {} @@ -398,9 +407,10 @@ class 
InstructBlipForConditionalGenerationDecoderOnlyModelTester: self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests + self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens + self.image_token_index = image_token_index def prepare_config_and_inputs(self): _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() @@ -408,6 +418,14 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester: _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() config = self.get_config() + vision_tokens = ( + torch.ones((input_ids.shape[0], self.num_query_tokens), device=torch_device, dtype=input_ids.dtype) + * self.image_token_index + ) + input_ids[input_ids == self.image_token_index] = self.text_model_tester.pad_token_id + input_ids = torch.cat([vision_tokens, input_ids], dim=-1) + vision_attention_mask = torch.ones_like(vision_tokens) + attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1) return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values @@ -417,6 +435,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester: qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), num_query_tokens=self.num_query_tokens, + image_token_index=self.image_token_index, ) def create_and_check_for_conditional_generation( @@ -455,6 +474,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester: @require_torch class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, 
unittest.TestCase): all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else () pipeline_model_mapping = {"image-text-to-text": InstructBlipForConditionalGeneration} fx_compatible = False test_head_masking = False @@ -532,6 +552,199 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene model = InstructBlipForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) + # overwrite because InstructBLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): + use_cache = True # force this to be True in case False is passed + + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + ) + + seq_length = getattr(self.model_tester, "seq_length", None) + seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) + seq_length = getattr(self.model_tester, "text_seq_length", seq_length) + + config = config.text_config if hasattr(config, "text_config") else config + + gen_len = ( + output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length + ) + + # in some models we subsample the sequence length in inner layers + if hasattr(self.model_tester, "get_subsampled_output_lengths"): + seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) + + # scores + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) + + # unprocessed logits + self._check_logits(internal_batch_size, output.logits, config=config) + + # Attentions + if self.has_attentions: + if config.is_encoder_decoder: + # encoder + self._check_encoder_attention_for_generate( 
+ output.encoder_attentions, input_batch_size, config, seq_length + ) + # decoder + self._check_attentions_for_generate( + internal_batch_size, + output.decoder_attentions, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + attentions = output.attentions if not use_cache else output.attentions[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_attentions_for_generate( + internal_batch_size, + attentions=attentions, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Hidden States + if config.is_encoder_decoder: + # encoder + self._check_encoder_hidden_states_for_generate( + output.encoder_hidden_states, input_batch_size, config, seq_length + ) + + # decoder + self._check_hidden_states_for_generate( + internal_batch_size, + output.decoder_hidden_states, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_hidden_states_for_generate( + internal_batch_size, + hidden_states, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Past Key Value States + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + internal_batch_size, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) + + # overwrite because InstructBLIP cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present + @pytest.mark.generate + def 
test_left_padding_compatibility(self): + # NOTE: left-padding results in small numerical differences. This is expected. + # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + # First, filter out models that don't support left padding + # - The model must have generative capabilities + if len(self.all_generative_model_classes) == 0: + self.skipTest(reason="No generative architecture available for this model.") + + # - The model must support padding + if not self.has_attentions: + self.skipTest(reason="This model doesn't support padding.") + + # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + decoder_only_classes = [] + for model_class in self.all_generative_model_classes: + config, _ = self.prepare_config_and_inputs_for_generate() + if config.is_encoder_decoder: + continue + else: + decoder_only_classes.append(model_class) + if len(decoder_only_classes) == 0: + self.skipTest(reason="No decoder-only architecture available for this model.") + + # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't + # added support for it yet. We skip these models for now. + has_encoder_attributes = any( + attr_name + for attr_name in config.to_dict().keys() + if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" + ) + if has_encoder_attributes: + self.skipTest( + reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." 
+ ) + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict.get("attention_mask") + pixel_values = inputs_dict["pixel_values"] + qformer_input_ids = inputs_dict["qformer_input_ids"] + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, 
padded_attention_mask, signature) + next_logits_with_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) + + @unittest.skip( + "InstructBLIP cannot generate only from input ids, and requires pixel values in all cases to be present" + ) + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): + pass + @require_torch_sdpa def test_sdpa_can_dispatch_composite_models(self): """ @@ -632,12 +845,12 @@ class InstructBlipModelIntegrationTest(unittest.TestCase): outputs = model.generate(**inputs, max_new_tokens=30) generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() - expected_outputs = [2, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 372, 338, 19500, 1623, 263, 19587, 4272] # fmt: off + expected_outputs = [2, 1724, 338, 22910, 1048, 445, 1967, 29973, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 19500, 373, 263, 19587, 4272, 11952, 29889] # fmt: off self.assertEqual(outputs[0].tolist(), expected_outputs) self.assertEqual( generated_text, - "The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while it is driving down a busy city", + "What is unusual about this image? 
The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while driving on a busy city street.", ) def test_inference_flant5_xl(self): diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index ce25571d29..7e0bf4eaf0 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -19,7 +19,9 @@ import tempfile import unittest import numpy as np +import pytest from huggingface_hub import hf_hub_download +from parameterized import parameterized from transformers import ( CONFIG_MAPPING, @@ -398,7 +400,14 @@ class InstructBlipVideoTextModelDecoderOnlyTester: # this model tester uses a decoder-only language model (OPT) class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester: def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10, + video_token_index=4, ): if vision_kwargs is None: vision_kwargs = {} @@ -412,17 +421,30 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester: self.qformer_model_tester = InstructBlipVideoQFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = InstructBlipVideoTextModelDecoderOnlyTester(parent, **text_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests + self.frames = self.vision_model_tester.frames + # need seq_length for common tests + self.seq_length = self.text_model_tester.seq_length + (num_query_tokens * self.frames) self.is_training = is_training self.num_query_tokens = num_query_tokens + self.video_token_index = video_token_index def 
prepare_config_and_inputs(self): _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() _, _, _, qformer_input_ids, qformer_attention_mask = self.qformer_model_tester.prepare_config_and_inputs() _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - frames = self.vision_model_tester.frames _, c, h, w = pixel_values.shape - pixel_values = pixel_values.reshape(-1, frames, c, h, w) + pixel_values = pixel_values.reshape(-1, self.frames, c, h, w) + + vision_tokens = ( + torch.ones( + (input_ids.shape[0], self.num_query_tokens * self.frames), device=torch_device, dtype=input_ids.dtype + ) + * self.video_token_index + ) + input_ids[input_ids == self.video_token_index] = self.text_model_tester.pad_token_id + input_ids = torch.cat([vision_tokens, input_ids], dim=-1) + vision_attention_mask = torch.ones_like(vision_tokens) + attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1) config = self.get_config() @@ -434,6 +456,7 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester: qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), num_query_tokens=self.num_query_tokens, + video_token_index=self.video_token_index, ) def create_and_check_for_conditional_generation( @@ -476,6 +499,7 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest( ModelTesterMixin, GenerationTesterMixin, unittest.TestCase ): all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else () fx_compatible = False test_head_masking = False test_pruning = False @@ -552,6 +576,199 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest( model = InstructBlipVideoForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) + # overwrite because InstructBLIPVideo internally calls LM.generate() with 
embeds thus it cannot operate in no cache format + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): + use_cache = True # force this to be True in case False is passed + + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + ) + + seq_length = getattr(self.model_tester, "seq_length", None) + seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) + seq_length = getattr(self.model_tester, "text_seq_length", seq_length) + + config = config.text_config if hasattr(config, "text_config") else config + + gen_len = ( + output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length + ) + + # in some models we subsample the sequence length in inner layers + if hasattr(self.model_tester, "get_subsampled_output_lengths"): + seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) + + # scores + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) + + # unprocessed logits + self._check_logits(internal_batch_size, output.logits, config=config) + + # Attentions + if self.has_attentions: + if config.is_encoder_decoder: + # encoder + self._check_encoder_attention_for_generate( + output.encoder_attentions, input_batch_size, config, seq_length + ) + # decoder + self._check_attentions_for_generate( + internal_batch_size, + output.decoder_attentions, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + attentions = output.attentions if not use_cache else output.attentions[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_attentions_for_generate( + internal_batch_size, + attentions=attentions, + min_length=min_length, + 
max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Hidden States + if config.is_encoder_decoder: + # encoder + self._check_encoder_hidden_states_for_generate( + output.encoder_hidden_states, input_batch_size, config, seq_length + ) + + # decoder + self._check_hidden_states_for_generate( + internal_batch_size, + output.decoder_hidden_states, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_hidden_states_for_generate( + internal_batch_size, + hidden_states, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Past Key Value States + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + internal_batch_size, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) + + # overwrite because InstructBLIPVideo cannot generate only from input ids, and requires `pixel_values` and `qformer_input_ids` in all cases to be present + @pytest.mark.generate + def test_left_padding_compatibility(self): + # NOTE: left-padding results in small numerical differences. This is expected.
+ # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + # First, filter out models that don't support left padding + # - The model must have generative capabilities + if len(self.all_generative_model_classes) == 0: + self.skipTest(reason="No generative architecture available for this model.") + + # - The model must support padding + if not self.has_attentions: + self.skipTest(reason="This model doesn't support padding.") + + # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + decoder_only_classes = [] + for model_class in self.all_generative_model_classes: + config, _ = self.prepare_config_and_inputs_for_generate() + if config.is_encoder_decoder: + continue + else: + decoder_only_classes.append(model_class) + if len(decoder_only_classes) == 0: + self.skipTest(reason="No decoder-only architecture available for this model.") + + # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't + # added support for it yet. We skip these models for now. + has_encoder_attributes = any( + attr_name + for attr_name in config.to_dict().keys() + if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" + ) + if has_encoder_attributes: + self.skipTest( + reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." 
+ ) + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict.get("attention_mask") + pixel_values = inputs_dict["pixel_values"] + qformer_input_ids = inputs_dict["qformer_input_ids"] + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, 
padded_attention_mask, signature) + next_logits_with_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) + + @unittest.skip( + "InstructBLIPVideo cannot generate only from input ids, and requires pixel values in all cases to be present" + ) + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): + pass + @require_torch_sdpa def test_sdpa_can_dispatch_composite_models(self): """ @@ -643,7 +860,7 @@ class InstructBlipVideoModelIntegrationTest(unittest.TestCase): generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() self.assertEqual( generated_text, - "a baby girl wearing glasses is reading a book on the bed 1080p", + "Explain what is happening in this short video. a baby girl wearing glasses is reading a book on the bed 1080p", ) def test_expansion_in_processing(self):