From b469ebc5cfd34846ab02e9038e60bc73b4c74a3a Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Thu, 21 Mar 2024 21:33:18 +0500 Subject: [PATCH] Prepend `bos token` to Blip generations (#29642) * prepend "bos" to blip generation * minor changes * Update src/transformers/models/blip_2/modeling_blip_2.py Co-authored-by: Joao Gante * Update src/transformers/models/instructblip/modeling_instructblip.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * add generation tester mixin --------- Co-authored-by: Joao Gante Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/blip_2/modeling_blip_2.py | 16 +++++++++++- .../instructblip/modeling_instructblip.py | 25 +++++++++++++------ tests/models/blip_2/test_modeling_blip_2.py | 8 +++--- .../test_modeling_instructblip.py | 3 ++- 4 files changed, 39 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 3e63fac66f..c776df1bc0 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -1828,8 +1828,10 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel): inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) # add image_embeds length to max_length, so that the final max_length in counted only on token embeds + # -1 is to account for the prepended BOS after `generate.` + # TODO (joao, raushan): refactor `generate` to avoid these operations with VLMs if not self.language_model.config.is_encoder_decoder: - generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] + generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1 generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] outputs = 
self.language_model.generate( @@ -1838,4 +1840,16 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel): **generate_kwargs, ) + # this is a temporary workaround to be consistent with other generation models and + # have BOS as the first token, even though under the hood we are calling LM with embeds + if not self.language_model.config.is_encoder_decoder: + bos_tokens = ( + torch.LongTensor([[self.config.text_config.bos_token_id]]) + .repeat(batch_size, 1) + .to(image_embeds.device) + ) + if not isinstance(outputs, torch.Tensor): + outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) + else: + outputs = torch.cat([bos_tokens, outputs], dim=-1) return outputs diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index da0b02551f..ba78b9143d 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -1538,8 +1538,9 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel): inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) # add image_embeds length to max_length, so that the final max_length in counted only on token embeds + # -1 is to account for the prepended BOS after `generate.` if not self.language_model.config.is_encoder_decoder: - generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] + generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1 generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] outputs = self.language_model.generate( @@ -1548,13 +1549,21 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel): **generate_kwargs, ) - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the 
tokenizer's bos token being set to </s> which has ID=2, - # whereas the model's text config has bos token id = 0 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM": - if isinstance(outputs, torch.Tensor): - outputs[outputs == 0] = 2 + # this is a temporary workaround to be consistent with other generation models and + # have BOS as the first token, even though under the hood we are calling LM with embeds + if not self.language_model.config.is_encoder_decoder: + # the InstructBLIP authors used inconsistent tokenizer/model files during training, + # with the tokenizer's bos token being set to </s> which has ID=2, + # whereas the model's text config has bos token id = 0 + bos_token_id = ( + 2 + if self.config.text_config.architectures[0] == "LLaMAForCausalLM" + else self.config.text_config.bos_token_id + ) + bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) + if not isinstance(outputs, torch.Tensor): + outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) else: - outputs.sequences[outputs.sequences == 0] = 2 + outputs = torch.cat([bos_tokens, outputs], dim=-1) return outputs diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index cffb7a1fe7..4abbba22f5 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -32,6 +32,7 @@ from transformers.testing_utils import ( ) from transformers.utils import is_torch_available, is_vision_available +from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, @@ -434,7 +435,7 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester: @require_torch -class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase): +class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): 
all_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else () fx_compatible = False test_head_masking = False @@ -683,7 +684,7 @@ class Blip2ModelTester: @require_torch -class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): +class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (Blip2ForConditionalGeneration, Blip2Model) if is_torch_available() else () pipeline_model_mapping = ( { @@ -869,7 +870,8 @@ class Blip2ModelIntegrationTest(unittest.TestCase): prompt = "Question: which city is this? Answer:" inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16) - predictions = model.generate(**inputs) + # max_length for BLIP includes prompt length from now on, use max_new_tokens + predictions = model.generate(**inputs, max_new_tokens=11) generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() # Test output diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index ffc9c6eb0e..9ed95b56b6 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -39,6 +39,7 @@ from transformers.testing_utils import ( ) from transformers.utils import is_torch_available, is_vision_available +from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, @@ -452,7 +453,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester: @require_torch -class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase): +class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (InstructBlipForConditionalGeneration,) if 
is_torch_available() else () fx_compatible = False test_head_masking = False