From 30409af6e1b2b5efb6d9932b3e3b4ce20cfdb30e Mon Sep 17 00:00:00 2001
From: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Date: Thu, 3 Aug 2023 11:01:10 +0100
Subject: [PATCH] Update InstructBLIP & Align values after rescale update
 (#25209)

* Update InstructBLIP values
Note: the tests are not independent. Running a test independently produces different logits than running all the integration tests together.

* Update test values after rescale update

* Remove leftover commented-out code

* Revert to previous rescaling logic

* Update rescale tests
---
 .../models/efficientnet/image_processing_efficientnet.py  | 8 ++++----
 src/transformers/models/vivit/image_processing_vivit.py   | 8 ++++----
 .../efficientnet/test_image_processing_efficientnet.py    | 4 ++--
 tests/models/instructblip/test_modeling_instructblip.py   | 6 +++---
 tests/models/vivit/test_image_processing_vivit.py         | 4 ++--
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet.py b/src/transformers/models/efficientnet/image_processing_efficientnet.py
index 8873a80069..9a19ab1ff6 100644
--- a/src/transformers/models/efficientnet/image_processing_efficientnet.py
+++ b/src/transformers/models/efficientnet/image_processing_efficientnet.py
@@ -155,10 +155,11 @@ class EfficientNetImageProcessor(BaseImageProcessor):
         """
         Rescale an image by a scale factor.
 
-        If offset is True, the image is rescaled between [-1, 1].
-            image = image * scale * 2 - 1
+        If `offset` is `True`, the image has its values rescaled by `scale` and then offset by 1. If `scale` is
+        1/127.5, the image is rescaled between [-1, 1].
+            image = image * scale - 1
 
-        If offset is False, the image is rescaled between [0, 1].
+        If `offset` is `False`, and `scale` is 1/255, the image is rescaled between [0, 1].
             image = image * scale
 
         Args:
@@ -171,7 +172,6 @@ class EfficientNetImageProcessor(BaseImageProcessor):
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
-        scale = scale * 2 if offset else scale
         rescaled_image = rescale(image, scale=scale, data_format=data_format, **kwargs)
 
         if offset:
diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py
index 41666e9999..0790c5d82b 100644
--- a/src/transformers/models/vivit/image_processing_vivit.py
+++ b/src/transformers/models/vivit/image_processing_vivit.py
@@ -179,10 +179,11 @@ class VivitImageProcessor(BaseImageProcessor):
         """
         Rescale an image by a scale factor.
 
-        If offset is True, the image is rescaled between [-1, 1].
-            image = image * scale * 2 - 1
+        If `offset` is `True`, the image has its values rescaled by `scale` and then offset by 1. If `scale` is
+        1/127.5, the image is rescaled between [-1, 1].
+            image = image * scale - 1
 
-        If offset is False, the image is rescaled between [0, 1].
+        If `offset` is `False`, and `scale` is 1/255, the image is rescaled between [0, 1].
             image = image * scale
 
         Args:
@@ -195,7 +196,6 @@ class VivitImageProcessor(BaseImageProcessor):
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
-        scale = scale * 2 if offset else scale
         rescaled_image = rescale(image, scale=scale, data_format=data_format, **kwargs)
 
         if offset:
diff --git a/tests/models/efficientnet/test_image_processing_efficientnet.py b/tests/models/efficientnet/test_image_processing_efficientnet.py
index 3e427474f6..bc65e7acbf 100644
--- a/tests/models/efficientnet/test_image_processing_efficientnet.py
+++ b/tests/models/efficientnet/test_image_processing_efficientnet.py
@@ -200,8 +200,8 @@ class EfficientNetImageProcessorTest(ImageProcessingSavingTestMixin, unittest.Te
 
         image_processor = self.image_processing_class(**self.image_processor_dict)
 
-        rescaled_image = image_processor.rescale(image, scale=1 / 255)
-        expected_image = (image * (2 / 255.0)).astype(np.float32) - 1
+        rescaled_image = image_processor.rescale(image, scale=1 / 127.5)
+        expected_image = (image * (1 / 127.5)).astype(np.float32) - 1
         self.assertTrue(np.allclose(rescaled_image, expected_image))
 
         rescaled_image = image_processor.rescale(image, scale=1 / 255, offset=False)
diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py
index 49d780918c..f8ce2b22e8 100644
--- a/tests/models/instructblip/test_modeling_instructblip.py
+++ b/tests/models/instructblip/test_modeling_instructblip.py
@@ -538,7 +538,7 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
             logits = model(**inputs).logits
 
         expected_slice = torch.tensor(
-            [[-3.5020, -12.3281, 8.4453], [-5.1406, -11.9609, 7.8711], [-4.0430, -13.4375, 9.1172]],
+            [[-3.4727, -11.8203, 8.3828], [-5.1172, -11.3438, 7.7656], [-4.0742, -13.4688, 9.1953]],
             device=torch_device,
         )
         self.assertTrue(torch.allclose(logits[0, :3, :3].float(), expected_slice, atol=1e-3))
@@ -548,12 +548,12 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
         generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
 
         # fmt: off
-        expected_outputs = [    2,   450, 22910,  9565,   310,   445,  1967,   338,   393,   263, 767,   338, 13977,   292, 22095,   373,   278,  1250,   310,   263, 13328, 20134, 29963,  1550, 19500,  1623,   263, 19587,  4272, 11952, 29889]
+        expected_outputs = [2, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 19500, 373, 263, 19587, 4272, 11952, 29889]
         # fmt: on
         self.assertEqual(outputs[0].tolist(), expected_outputs)
         self.assertEqual(
             generated_text,
-            "The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while driving down a busy city street.",
+            "The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while driving on a busy city street.",
         )
 
     def test_inference_flant5_xl(self):
diff --git a/tests/models/vivit/test_image_processing_vivit.py b/tests/models/vivit/test_image_processing_vivit.py
index 0b445cf474..6954734748 100644
--- a/tests/models/vivit/test_image_processing_vivit.py
+++ b/tests/models/vivit/test_image_processing_vivit.py
@@ -219,8 +219,8 @@ class VivitImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase
 
         image_processor = self.image_processing_class(**self.image_processor_dict)
 
-        rescaled_image = image_processor.rescale(image, scale=1 / 255)
-        expected_image = (image * (2 / 255.0)).astype(np.float32) - 1
+        rescaled_image = image_processor.rescale(image, scale=1 / 127.5)
+        expected_image = (image * (1 / 127.5)).astype(np.float32) - 1
         self.assertTrue(np.allclose(rescaled_image, expected_image))
 
         rescaled_image = image_processor.rescale(image, scale=1 / 255, offset=False)