BLIP: enable generation tests (#34174)

* blip2 tests
* instructblips
* copies
* fix slow tests
* fix
* uncomment this
* clean up after rebase
* should be model main input
* fix overwritten tests
* oops len should be multiple of frame number
* style
* fix some tests
2024-11-01 08:54:48 +01:00
parent 6beb3f1691
commit 4cc0813e28
8 changed files with 671 additions and 96 deletions
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -20,7 +20,9 @@ import tempfile
 import unittest

 import numpy as np
+import pytest
 import requests
+from parameterized import parameterized

 from transformers import CONFIG_MAPPING, Blip2Config, Blip2QFormerConfig, Blip2VisionConfig
 from transformers.testing_utils import (
@@ -392,7 +394,14 @@ class Blip2TextModelDecoderOnlyTester:
 # this model tester uses a decoder-only language model (OPT)
 class Blip2ForConditionalGenerationDecoderOnlyModelTester:
    def __init__(
-        self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10
+        self,
+        parent,
+        vision_kwargs=None,
+        qformer_kwargs=None,
+        text_kwargs=None,
+        is_training=True,
+        num_query_tokens=10,
+        image_token_index=4,
    ):
        if vision_kwargs is None:
            vision_kwargs = {}
@@ -406,14 +415,24 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester:
        self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs)
        self.text_model_tester = Blip2TextModelDecoderOnlyTester(parent, **text_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
-        self.seq_length = self.text_model_tester.seq_length  # need seq_length for common tests
+        self.seq_length = self.text_model_tester.seq_length + num_query_tokens  # need seq_length for common tests
        self.is_training = is_training
        self.num_query_tokens = num_query_tokens
+        self.image_token_index = image_token_index

    def prepare_config_and_inputs(self):
        _, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
        _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()

+        vision_tokens = (
+            torch.ones((input_ids.shape[0], self.num_query_tokens), device=torch_device, dtype=input_ids.dtype)
+            * self.image_token_index
+        )
+        input_ids[input_ids == self.image_token_index] = self.text_model_tester.pad_token_id
+        input_ids = torch.cat([vision_tokens, input_ids], dim=-1)
+        vision_attention_mask = torch.ones_like(vision_tokens)
+        attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1)
+
        config = self.get_config()

        return config, input_ids, attention_mask, pixel_values
@@ -424,6 +443,7 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester:
            qformer_config=self.qformer_model_tester.get_config(),
            text_config=self.text_model_tester.get_config(),
            num_query_tokens=self.num_query_tokens,
+            image_token_index=self.image_token_index,
        )

    def create_and_check_for_conditional_generation(self, config, input_ids, attention_mask, pixel_values):
@@ -451,6 +471,7 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester:
@require_torch
 class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    all_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
+    all_generative_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
    fx_compatible = False
    test_head_masking = False
    test_pruning = False
@@ -693,6 +714,192 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
        model = Blip2ForConditionalGeneration.from_pretrained(model_name)
        self.assertIsNotNone(model)

+    # Overridden because BLIP-2 internally calls LM.generate() with embeds, so it cannot run without a cache
+    def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1):
+        use_cache = True  # force this to be True in case False is passed
+
+        input_batch_size = int(output.sequences.shape[0] / num_return_sequences)
+        internal_batch_size = (
+            input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences
+        )
+
+        seq_length = getattr(self.model_tester, "seq_length", None)
+        seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length)
+        seq_length = getattr(self.model_tester, "text_seq_length", seq_length)
+
+        config = config.text_config if hasattr(config, "text_config") else config
+
+        gen_len = (
+            output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length
+        )
+
+        # in some models we subsample the sequence length in inner layers
+        if hasattr(self.model_tester, "get_subsampled_output_lengths"):
+            seq_length = self.model_tester.get_subsampled_output_lengths(seq_length)
+
+        # scores
+        self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config)
+
+        # unprocessed logits
+        self._check_logits(internal_batch_size, output.logits, config=config)
+
+        # Attentions
+        if self.has_attentions:
+            if config.is_encoder_decoder:
+                # encoder
+                self._check_encoder_attention_for_generate(
+                    output.encoder_attentions, input_batch_size, config, seq_length
+                )
+                # decoder
+                self._check_attentions_for_generate(
+                    internal_batch_size,
+                    output.decoder_attentions,
+                    min_length=1,
+                    max_length=output.sequences.shape[-1],
+                    config=config,
+                    use_cache=use_cache,
+                )
+            else:
+                # when use_cache is set, the first entry is the same as without caching, so skip it here
+                attentions = output.attentions if not use_cache else output.attentions[1:]
+                min_length = seq_length if not use_cache else seq_length + 1
+                self._check_attentions_for_generate(
+                    internal_batch_size,
+                    attentions=attentions,
+                    min_length=min_length,
+                    max_length=output.sequences.shape[-1],
+                    config=config,
+                    use_cache=use_cache,
+                )
+
+        # Hidden States
+        if config.is_encoder_decoder:
+            # encoder
+            self._check_encoder_hidden_states_for_generate(
+                output.encoder_hidden_states, input_batch_size, config, seq_length
+            )
+
+            # decoder
+            self._check_hidden_states_for_generate(
+                internal_batch_size,
+                output.decoder_hidden_states,
+                min_length=1,
+                max_length=output.sequences.shape[-1],
+                config=config,
+                use_cache=use_cache,
+            )
+        else:
+            # when use_cache is set, the first entry is the same as without caching, so skip it here
+            hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:]
+            min_length = seq_length if not use_cache else seq_length + 1
+            self._check_hidden_states_for_generate(
+                internal_batch_size,
+                hidden_states,
+                min_length=min_length,
+                max_length=output.sequences.shape[-1],
+                config=config,
+                use_cache=use_cache,
+            )
+
+        # Past Key Value States
+        if use_cache:
+            past_key_values = output.past_key_values
+            past_sequence_length = output.sequences.shape[-1] - 1
+            self._check_past_key_values_for_generate(
+                internal_batch_size,
+                past_key_values,
+                seq_length=past_sequence_length,
+                config=config,
+            )
+
+    # Overridden because BLIP-2 cannot generate from input ids alone; pixel values must always be provided
+    @pytest.mark.generate
+    def test_left_padding_compatibility(self):
+        # NOTE: left-padding results in small numerical differences. This is expected.
+        # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535
+
+        # First, filter out models that don't support left padding
+        # - The model must have generative capabilities
+        if len(self.all_generative_model_classes) == 0:
+            self.skipTest(reason="No generative architecture available for this model.")
+
+        # - The model must support padding
+        if not self.has_attentions:
+            self.skipTest(reason="This model doesn't support padding.")
+
+        # - The model must be a decoder-only architecture (encoder-based architectures use right-padding)
+        decoder_only_classes = []
+        for model_class in self.all_generative_model_classes:
+            config, _ = self.prepare_config_and_inputs_for_generate()
+            if config.is_encoder_decoder:
+                continue
+            else:
+                decoder_only_classes.append(model_class)
+        if len(decoder_only_classes) == 0:
+            self.skipTest(reason="No decoder-only architecture available for this model.")
+
+        # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't
+        #   added support for it yet. We skip these models for now.
+        has_encoder_attributes = any(
+            attr_name
+            for attr_name in config.to_dict().keys()
+            if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size"
+        )
+        if has_encoder_attributes:
+            self.skipTest(
+                reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding."
+            )
+
+        # Then, test left-padding
+        def _prepare_model_kwargs(input_ids, attention_mask, signature):
+            model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
+            if "position_ids" in signature:
+                position_ids = torch.cumsum(attention_mask, dim=-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                model_kwargs["position_ids"] = position_ids
+            if "cache_position" in signature:
+                cache_position = torch.arange(input_ids.shape[-1], device=torch_device)
+                model_kwargs["cache_position"] = cache_position
+            return model_kwargs
+
+        for model_class in decoder_only_classes:
+            config, inputs_dict = self.prepare_config_and_inputs_for_generate()
+            input_ids = inputs_dict["input_ids"]
+            attention_mask = inputs_dict.get("attention_mask")
+            pixel_values = inputs_dict["pixel_values"]
+            if attention_mask is None:
+                attention_mask = torch.ones_like(input_ids)
+
+            model = model_class(config).to(torch_device).eval()
+            signature = inspect.signature(model.forward).parameters.keys()
+
+            # disable the cache, as some models require special cache classes to be initialized outside forward
+            model.generation_config.use_cache = False
+
+            # Without padding
+            model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature)
+            next_logits_wo_padding = model(**model_kwargs, pixel_values=pixel_values).logits[:, -1, :]
+
+            # With left-padding (length 32)
+            # fall back to pad token 0 if the text config has none; padded positions are masked out anyway
+            pad_token_id = (
+                config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0
+            )
+            pad_size = (input_ids.shape[0], 32)
+            padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id
+            padded_input_ids = torch.cat((padding, input_ids), dim=1)
+            padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1)
+            model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature)
+            next_logits_with_padding = model(**model_kwargs, pixel_values=pixel_values).logits[:, -1, :]
+
+            # They should result in very similar logits
+            self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5))
+
+    @unittest.skip("BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present")
+    @parameterized.expand([("greedy", 1), ("beam search", 2)])
+    def test_generate_from_inputs_embeds(self, _, num_beams):
+        pass
+

 # this class is based on `T5ModelTester` found in tests/models/t5/test_modeling_t5.py
 class Blip2TextModelTester:
@@ -1780,6 +1987,7 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
        generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()

        # Test output
+        print(predictions[0].tolist(), generated_text)
        self.assertEqual(predictions[0].tolist(), [2, 102, 693, 2828, 15, 5, 4105, 19, 10, 2335, 50118])
        self.assertEqual("a woman sitting on the beach with a dog", generated_text)

@@ -1794,9 +2002,9 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
        # Test output
        self.assertEqual(
            predictions[0].tolist(),
-            [2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118],
+            [2, 45641, 35, 61, 343, 16, 42, 116, 31652, 35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118],
        )
-        self.assertEqual(generated_text, "it's not a city, it's a beach")
+        self.assertEqual(generated_text, "Question: which city is this? Answer: it's not a city, it's a beach")

    def test_inference_interpolate_pos_encoding(self):
        processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
@@ -1905,9 +2113,9 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
        # Test output
        self.assertEqual(
            predictions[0].tolist(),
-            [2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118],
+            [2, 45641, 35, 61, 343, 16, 42, 116, 31652, 35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118],
        )
-        self.assertEqual(generated_text, "it's not a city, it's a beach")
+        self.assertEqual(generated_text, "Question: which city is this? Answer: it's not a city, it's a beach")

    @require_torch_multi_accelerator
    def test_inference_t5_multi_accelerator(self):
--- a/tests/models/instructblip/test_modeling_instructblip.py
+++ b/tests/models/instructblip/test_modeling_instructblip.py
@@ -19,7 +19,9 @@ import tempfile
 import unittest

 import numpy as np
+import pytest
 import requests
+from parameterized import parameterized

 from transformers import (
    CONFIG_MAPPING,
@@ -320,7 +322,7 @@ class InstructBlipTextModelDecoderOnlyTester:
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
+        max_position_embeddings=100,
        eos_token_id=2,
        pad_token_id=1,
        bos_token_id=0,
@@ -384,7 +386,14 @@ class InstructBlipTextModelDecoderOnlyTester:
 # this model tester uses a decoder-only language model (OPT)
 class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
    def __init__(
-        self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10
+        self,
+        parent,
+        vision_kwargs=None,
+        qformer_kwargs=None,
+        text_kwargs=None,
+        is_training=True,
+        num_query_tokens=10,
+        image_token_index=4,
    ):
        if vision_kwargs is None:
            vision_kwargs = {}
@@ -398,9 +407,10 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
        self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs)
        self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
-        self.seq_length = self.text_model_tester.seq_length  # need seq_length for common tests
+        self.seq_length = self.text_model_tester.seq_length + num_query_tokens  # need seq_length for common tests
        self.is_training = is_training
        self.num_query_tokens = num_query_tokens
+        self.image_token_index = image_token_index

    def prepare_config_and_inputs(self):
        _, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
@@ -408,6 +418,14 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
        _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()

        config = self.get_config()
+        vision_tokens = (
+            torch.ones((input_ids.shape[0], self.num_query_tokens), device=torch_device, dtype=input_ids.dtype)
+            * self.image_token_index
+        )
+        input_ids[input_ids == self.image_token_index] = self.text_model_tester.pad_token_id
+        input_ids = torch.cat([vision_tokens, input_ids], dim=-1)
+        vision_attention_mask = torch.ones_like(vision_tokens)
+        attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1)

        return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values

@@ -417,6 +435,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
            qformer_config=self.qformer_model_tester.get_config(),
            text_config=self.text_model_tester.get_config(),
            num_query_tokens=self.num_query_tokens,
+            image_token_index=self.image_token_index,
        )

    def create_and_check_for_conditional_generation(
@@ -455,6 +474,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
@require_torch
 class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
+    all_generative_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
    pipeline_model_mapping = {"image-text-to-text": InstructBlipForConditionalGeneration}
    fx_compatible = False
    test_head_masking = False
@@ -532,6 +552,199 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
        model = InstructBlipForConditionalGeneration.from_pretrained(model_name)
        self.assertIsNotNone(model)

+    # Overridden because InstructBLIP internally calls LM.generate() with embeds, so it cannot run without a cache
+    def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1):
+        use_cache = True  # force this to be True in case False is passed
+
+        input_batch_size = int(output.sequences.shape[0] / num_return_sequences)
+        internal_batch_size = (
+            input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences
+        )
+
+        seq_length = getattr(self.model_tester, "seq_length", None)
+        seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length)
+        seq_length = getattr(self.model_tester, "text_seq_length", seq_length)
+
+        config = config.text_config if hasattr(config, "text_config") else config
+
+        gen_len = (
+            output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length
+        )
+
+        # in some models we subsample the sequence length in inner layers
+        if hasattr(self.model_tester, "get_subsampled_output_lengths"):
+            seq_length = self.model_tester.get_subsampled_output_lengths(seq_length)
+
+        # scores
+        self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config)
+
+        # unprocessed logits
+        self._check_logits(internal_batch_size, output.logits, config=config)
+
+        # Attentions
+        if self.has_attentions:
+            if config.is_encoder_decoder:
+                # encoder
+                self._check_encoder_attention_for_generate(
+                    output.encoder_attentions, input_batch_size, config, seq_length
+                )
+                # decoder
+                self._check_attentions_for_generate(
+                    internal_batch_size,
+                    output.decoder_attentions,
+                    min_length=1,
+                    max_length=output.sequences.shape[-1],
+                    config=config,
+                    use_cache=use_cache,
+                )
+            else:
+                # when use_cache is set, the first entry is the same as without caching, so skip it here
+                attentions = output.attentions if not use_cache else output.attentions[1:]
+                min_length = seq_length if not use_cache else seq_length + 1
+                self._check_attentions_for_generate(
+                    internal_batch_size,
+                    attentions=attentions,
+                    min_length=min_length,
+                    max_length=output.sequences.shape[-1],
+                    config=config,
+                    use_cache=use_cache,
+                )
+
+        # Hidden States
+        if config.is_encoder_decoder:
+            # encoder
+            self._check_encoder_hidden_states_for_generate(
+                output.encoder_hidden_states, input_batch_size, config, seq_length
+            )
+
+            # decoder
+            self._check_hidden_states_for_generate(
+                internal_batch_size,
+                output.decoder_hidden_states,
+                min_length=1,
+                max_length=output.sequences.shape[-1],
+                config=config,
+                use_cache=use_cache,
+            )
+        else:
+            # when use_cache is set, the first entry is the same as without caching, so skip it here
+            hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:]
+            min_length = seq_length if not use_cache else seq_length + 1
+            self._check_hidden_states_for_generate(
+                internal_batch_size,
+                hidden_states,
+                min_length=min_length,
+                max_length=output.sequences.shape[-1],
+                config=config,
+                use_cache=use_cache,
+            )
+
+        # Past Key Value States
+        if use_cache:
+            past_key_values = output.past_key_values
+            past_sequence_length = output.sequences.shape[-1] - 1
+            self._check_past_key_values_for_generate(
+                internal_batch_size,
+                past_key_values,
+                seq_length=past_sequence_length,
+                config=config,
+            )
+
+    # Overridden because InstructBLIP cannot generate from input ids alone; `pixel_values` and `qformer_input_ids` must always be provided
+    @pytest.mark.generate
+    def test_left_padding_compatibility(self):
+        # NOTE: left-padding results in small numerical differences. This is expected.
+        # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535
+
+        # First, filter out models that don't support left padding
+        # - The model must have generative capabilities
+        if len(self.all_generative_model_classes) == 0:
+            self.skipTest(reason="No generative architecture available for this model.")
+
+        # - The model must support padding
+        if not self.has_attentions:
+            self.skipTest(reason="This model doesn't support padding.")
+
+        # - The model must be a decoder-only architecture (encoder-based architectures use right-padding)
+        decoder_only_classes = []
+        for model_class in self.all_generative_model_classes:
+            config, _ = self.prepare_config_and_inputs_for_generate()
+            if config.is_encoder_decoder:
+                continue
+            else:
+                decoder_only_classes.append(model_class)
+        if len(decoder_only_classes) == 0:
+            self.skipTest(reason="No decoder-only architecture available for this model.")
+
+        # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't
+        #   added support for it yet. We skip these models for now.
+        has_encoder_attributes = any(
+            attr_name
+            for attr_name in config.to_dict().keys()
+            if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size"
+        )
+        if has_encoder_attributes:
+            self.skipTest(
+                reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding."
+            )
+
+        # Then, test left-padding
+        def _prepare_model_kwargs(input_ids, attention_mask, signature):
+            model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
+            if "position_ids" in signature:
+                position_ids = torch.cumsum(attention_mask, dim=-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                model_kwargs["position_ids"] = position_ids
+            if "cache_position" in signature:
+                cache_position = torch.arange(input_ids.shape[-1], device=torch_device)
+                model_kwargs["cache_position"] = cache_position
+            return model_kwargs
+
+        for model_class in decoder_only_classes:
+            config, inputs_dict = self.prepare_config_and_inputs_for_generate()
+            input_ids = inputs_dict["input_ids"]
+            attention_mask = inputs_dict.get("attention_mask")
+            pixel_values = inputs_dict["pixel_values"]
+            qformer_input_ids = inputs_dict["qformer_input_ids"]
+            if attention_mask is None:
+                attention_mask = torch.ones_like(input_ids)
+
+            model = model_class(config).to(torch_device).eval()
+            signature = inspect.signature(model.forward).parameters.keys()
+
+            # disable the cache, as some models require special cache classes to be initialized outside forward
+            model.generation_config.use_cache = False
+
+            # Without padding
+            model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature)
+            next_logits_wo_padding = model(
+                **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids
+            ).logits[:, -1, :]
+
+            # With left-padding (length 32)
+            # fall back to pad token 0 if the text config has none; padded positions are masked out anyway
+            pad_token_id = (
+                config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0
+            )
+            pad_size = (input_ids.shape[0], 32)
+            padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id
+            padded_input_ids = torch.cat((padding, input_ids), dim=1)
+            padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1)
+            model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature)
+            next_logits_with_padding = model(
+                **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids
+            ).logits[:, -1, :]
+
+            # They should result in very similar logits
+            self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5))
+
+    @unittest.skip(
+        "InstructBLIP cannot generate only from input ids, and requires pixel values in all cases to be present"
+    )
+    @parameterized.expand([("greedy", 1), ("beam search", 2)])
+    def test_generate_from_inputs_embeds(self, _, num_beams):
+        pass
+
    @require_torch_sdpa
    def test_sdpa_can_dispatch_composite_models(self):
        """
@@ -632,12 +845,12 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
        outputs = model.generate(**inputs, max_new_tokens=30)
        generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()

-        expected_outputs = [2, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 372, 338, 19500, 1623, 263, 19587, 4272]  # fmt: off
+        expected_outputs = [2, 1724, 338, 22910, 1048, 445, 1967, 29973, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 19500, 373, 263, 19587, 4272, 11952, 29889]  # fmt: off

        self.assertEqual(outputs[0].tolist(), expected_outputs)
        self.assertEqual(
            generated_text,
-            "The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while it is driving down a busy city",
+            "What is unusual about this image? The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while driving on a busy city street.",
        )

    def test_inference_flant5_xl(self):
--- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py
+++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py
@@ -19,7 +19,9 @@ import tempfile
 import unittest

 import numpy as np
+import pytest
 from huggingface_hub import hf_hub_download
+from parameterized import parameterized

 from transformers import (
    CONFIG_MAPPING,
@@ -398,7 +400,14 @@ class InstructBlipVideoTextModelDecoderOnlyTester:
 # this model tester uses a decoder-only language model (OPT)
 class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
    def __init__(
-        self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10
+        self,
+        parent,
+        vision_kwargs=None,
+        qformer_kwargs=None,
+        text_kwargs=None,
+        is_training=True,
+        num_query_tokens=10,
+        video_token_index=4,
    ):
        if vision_kwargs is None:
            vision_kwargs = {}
@@ -412,17 +421,30 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
        self.qformer_model_tester = InstructBlipVideoQFormerModelTester(parent, **qformer_kwargs)
        self.text_model_tester = InstructBlipVideoTextModelDecoderOnlyTester(parent, **text_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
-        self.seq_length = self.text_model_tester.seq_length  # need seq_length for common tests
+        self.frames = self.vision_model_tester.frames
+        # need seq_length for common tests
+        self.seq_length = self.text_model_tester.seq_length + (num_query_tokens * self.frames)
        self.is_training = is_training
        self.num_query_tokens = num_query_tokens
+        self.video_token_index = video_token_index

    def prepare_config_and_inputs(self):
        _, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
        _, _, _, qformer_input_ids, qformer_attention_mask = self.qformer_model_tester.prepare_config_and_inputs()
        _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
-        frames = self.vision_model_tester.frames
        _, c, h, w = pixel_values.shape
-        pixel_values = pixel_values.reshape(-1, frames, c, h, w)
+        pixel_values = pixel_values.reshape(-1, self.frames, c, h, w)
+
+        vision_tokens = (
+            torch.ones(
+                (input_ids.shape[0], self.num_query_tokens * self.frames), device=torch_device, dtype=input_ids.dtype
+            )
+            * self.video_token_index
+        )
+        input_ids[input_ids == self.video_token_index] = self.text_model_tester.pad_token_id
+        input_ids = torch.cat([vision_tokens, input_ids], dim=-1)
+        vision_attention_mask = torch.ones_like(vision_tokens)
+        attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1)

        config = self.get_config()

@@ -434,6 +456,7 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
            qformer_config=self.qformer_model_tester.get_config(),
            text_config=self.text_model_tester.get_config(),
            num_query_tokens=self.num_query_tokens,
+            video_token_index=self.video_token_index,
        )

    def create_and_check_for_conditional_generation(
@@ -476,6 +499,7 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
    ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
 ):
    all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
+    all_generative_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
    fx_compatible = False
    test_head_masking = False
    test_pruning = False
@@ -552,6 +576,199 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
        model = InstructBlipVideoForConditionalGeneration.from_pretrained(model_name)
        self.assertIsNotNone(model)

+    # overwritten because InstructBLIPVideo internally calls LM.generate() with input embeddings, so it cannot run with `use_cache=False`
+    def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1):
+        use_cache = True  # force this to be True in case False is passed
+
+        input_batch_size = int(output.sequences.shape[0] / num_return_sequences)
+        internal_batch_size = (
+            input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences
+        )
+
+        seq_length = getattr(self.model_tester, "seq_length", None)
+        seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length)
+        seq_length = getattr(self.model_tester, "text_seq_length", seq_length)
+
+        config = config.text_config if hasattr(config, "text_config") else config
+
+        gen_len = (
+            output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length
+        )
+
+        # in some models we subsample the sequence length in inner layers
+        if hasattr(self.model_tester, "get_subsampled_output_lengths"):
+            seq_length = self.model_tester.get_subsampled_output_lengths(seq_length)
+
+        # scores
+        self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config)
+
+        # unprocessed logits
+        self._check_logits(internal_batch_size, output.logits, config=config)
+
+        # Attentions
+        if self.has_attentions:
+            if config.is_encoder_decoder:
+                # encoder
+                self._check_encoder_attention_for_generate(
+                    output.encoder_attentions, input_batch_size, config, seq_length
+                )
+                # decoder
+                self._check_attentions_for_generate(
+                    internal_batch_size,
+                    output.decoder_attentions,
+                    min_length=1,
+                    max_length=output.sequences.shape[-1],
+                    config=config,
+                    use_cache=use_cache,
+                )
+            else:
+                # with use_cache, the first step is identical to the no-cache case, so skip it here
+                attentions = output.attentions if not use_cache else output.attentions[1:]
+                min_length = seq_length if not use_cache else seq_length + 1
+                self._check_attentions_for_generate(
+                    internal_batch_size,
+                    attentions=attentions,
+                    min_length=min_length,
+                    max_length=output.sequences.shape[-1],
+                    config=config,
+                    use_cache=use_cache,
+                )
+
+        # Hidden States
+        if config.is_encoder_decoder:
+            # encoder
+            self._check_encoder_hidden_states_for_generate(
+                output.encoder_hidden_states, input_batch_size, config, seq_length
+            )
+
+            # decoder
+            self._check_hidden_states_for_generate(
+                internal_batch_size,
+                output.decoder_hidden_states,
+                min_length=1,
+                max_length=output.sequences.shape[-1],
+                config=config,
+                use_cache=use_cache,
+            )
+        else:
+            # with use_cache, the first step is identical to the no-cache case, so skip it here
+            hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:]
+            min_length = seq_length if not use_cache else seq_length + 1
+            self._check_hidden_states_for_generate(
+                internal_batch_size,
+                hidden_states,
+                min_length=min_length,
+                max_length=output.sequences.shape[-1],
+                config=config,
+                use_cache=use_cache,
+            )
+
+        # Past Key Value States
+        if use_cache:
+            past_key_values = output.past_key_values
+            past_sequence_length = output.sequences.shape[-1] - 1
+            self._check_past_key_values_for_generate(
+                internal_batch_size,
+                past_key_values,
+                seq_length=past_sequence_length,
+                config=config,
+            )
+
+    # overwritten because InstructBLIPVideo cannot generate from input ids alone; it always requires `pixel_values` and `qformer_input_ids`
+    @pytest.mark.generate
+    def test_left_padding_compatibility(self):
+        # NOTE: left-padding results in small numerical differences. This is expected.
+        # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535
+
+        # First, filter out models that don't support left padding
+        # - The model must have generative capabilities
+        if len(self.all_generative_model_classes) == 0:
+            self.skipTest(reason="No generative architecture available for this model.")
+
+        # - The model must support padding
+        if not self.has_attentions:
+            self.skipTest(reason="This model doesn't support padding.")
+
+        # - The model must be a decoder-only architecture (encoder-based architectures use right-padding)
+        decoder_only_classes = []
+        for model_class in self.all_generative_model_classes:
+            config, _ = self.prepare_config_and_inputs_for_generate()
+            if config.is_encoder_decoder:
+                continue
+            else:
+                decoder_only_classes.append(model_class)
+        if len(decoder_only_classes) == 0:
+            self.skipTest(reason="No decoder-only architecture available for this model.")
+
+        # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't
+        #   added support for it yet. We skip these models for now.
+        has_encoder_attributes = any(
+            attr_name
+            for attr_name in config.to_dict().keys()
+            if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size"
+        )
+        if has_encoder_attributes:
+            self.skipTest(
+                reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding."
+            )
+
+        # Then, test left-padding
+        def _prepare_model_kwargs(input_ids, attention_mask, signature):
+            model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
+            if "position_ids" in signature:
+                position_ids = torch.cumsum(attention_mask, dim=-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                model_kwargs["position_ids"] = position_ids
+            if "cache_position" in signature:
+                cache_position = torch.arange(input_ids.shape[-1], device=torch_device)
+                model_kwargs["cache_position"] = cache_position
+            return model_kwargs
+
+        for model_class in decoder_only_classes:
+            config, inputs_dict = self.prepare_config_and_inputs_for_generate()
+            input_ids = inputs_dict["input_ids"]
+            attention_mask = inputs_dict.get("attention_mask")
+            pixel_values = inputs_dict["pixel_values"]
+            qformer_input_ids = inputs_dict["qformer_input_ids"]
+            if attention_mask is None:
+                attention_mask = torch.ones_like(input_ids)
+
+            model = model_class(config).to(torch_device).eval()
+            signature = inspect.signature(model.forward).parameters.keys()
+
+            # no cache, as some models require special cache classes to be initialized outside forward
+            model.generation_config.use_cache = False
+
+            # Without padding
+            model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature)
+            next_logits_wo_padding = model(
+                **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids
+            ).logits[:, -1, :]
+
+            # With left-padding (length 32)
+            # can hardcode pad_token to be 0 as we'll do attn masking anyway
+            pad_token_id = (
+                config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0
+            )
+            pad_size = (input_ids.shape[0], 32)
+            padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id
+            padded_input_ids = torch.cat((padding, input_ids), dim=1)
+            padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1)
+            model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature)
+            next_logits_with_padding = model(
+                **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids
+            ).logits[:, -1, :]
+
+            # They should result in very similar logits
+            self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5))
+
+    @unittest.skip(
+        "InstructBLIPVideo cannot generate only from input ids, and requires pixel values in all cases to be present"
+    )
+    @parameterized.expand([("greedy", 1), ("beam search", 2)])
+    def test_generate_from_inputs_embeds(self, _, num_beams):
+        pass
+
    @require_torch_sdpa
    def test_sdpa_can_dispatch_composite_models(self):
        """
@@ -643,7 +860,7 @@ class InstructBlipVideoModelIntegrationTest(unittest.TestCase):
        generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        self.assertEqual(
            generated_text,
-            "a baby girl wearing glasses is reading a book on the bed 1080p",
+            "Explain what is happening in this short video. a baby girl wearing glasses is reading a book on the bed 1080p",
        )

    def test_expansion_in_processing(self):