From eec76042f438226429d9f1b545a49a483ce33abb Mon Sep 17 00:00:00 2001 From: raghavanone <115454562+raghavanone@users.noreply.github.com> Date: Tue, 28 Feb 2023 20:24:08 +0530 Subject: [PATCH] Fix the issue of blip model returning loss even when the label is not provided. (#21811) * Fix the issue of blip model returning loss even when the label is not provided * Fix ruff failure * Incorporate PR feedbacks * Incorporate PR feedbacks * Incorporate PR feedbacks * Incorporate PR feedbacks --- src/transformers/models/blip/modeling_blip.py | 8 +------- tests/models/blip/test_modeling_blip.py | 3 ++- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index 7f1b3412b6..939bf26cc1 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -990,7 +990,7 @@ class BlipForConditionalGeneration(BlipPreTrainedModel): >>> outputs = model(**inputs) ```""" - batch_size = pixel_values.shape[0] + return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_outputs = self.vision_model( @@ -1002,12 +1002,6 @@ class BlipForConditionalGeneration(BlipPreTrainedModel): image_embeds = vision_outputs[0] - if input_ids is None: - input_ids = torch.LongTensor([[self.decoder_input_ids] * batch_size]).to(image_embeds.device) - - if labels is None: - labels = input_ids.masked_fill(input_ids == self.decoder_pad_token_id, -100) - outputs = self.text_decoder( input_ids=input_ids, attention_mask=attention_mask, diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index 3a847aaff0..9a3dd0e96e 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -24,7 +24,7 @@ import numpy as np import requests from transformers import BlipConfig, BlipTextConfig, BlipVisionConfig -from transformers.testing_utils import require_torch, 
require_vision, slow, torch_device +from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -1111,6 +1111,7 @@ class BlipModelIntegrationTest(unittest.TestCase): [30522, 1037, 3861, 1997, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102], ) + @require_torch_gpu def test_inference_image_captioning_fp16(self): model = BlipForConditionalGeneration.from_pretrained( "Salesforce/blip-image-captioning-base", torch_dtype=torch.float16