From 1d7773594754457ed4a79cf6d98bcaabea5bff51 Mon Sep 17 00:00:00 2001
From: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Date: Wed, 20 Dec 2023 20:55:51 +0000
Subject: [PATCH] Fix yolos resizing (#27663)

* Fix yolos resizing

* Update tests

* Add a test
---
 .../models/detr/image_processing_detr.py      |  1 +
 .../models/yolos/image_processing_yolos.py    | 22 +++++-----
 .../yolos/test_image_processing_yolos.py      | 44 ++++++++++++++-----
 3 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py
index 24c36c5d10..98fce25624 100644
--- a/src/transformers/models/detr/image_processing_detr.py
+++ b/src/transformers/models/detr/image_processing_detr.py
@@ -82,6 +82,7 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
+# From the original repo: https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/datasets/transforms.py#L76
 def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
     """
     Computes the output image size given the input image size and the desired output size.
diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py
index 3b0c635c0e..6b9aba42e5 100644
--- a/src/transformers/models/yolos/image_processing_yolos.py
+++ b/src/transformers/models/yolos/image_processing_yolos.py
@@ -99,7 +99,6 @@ def get_max_height_width(
     return (max_height, max_width)
 
 
-# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
 def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
     """
     Computes the output image size given the input image size and the desired output size.
@@ -119,16 +118,17 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
         if max_original_size / min_original_size * size > max_size:
             size = int(round(max_size * min_original_size / max_original_size))
 
-    if (height <= width and height == size) or (width <= height and width == size):
-        return height, width
-
-    if width < height:
-        ow = size
-        oh = int(size * height / width)
-    else:
-        oh = size
-        ow = int(size * width / height)
-    return (oh, ow)
+    if width < height and width != size:
+        height = int(size * height / width)
+        width = size
+    elif height < width and height != size:
+        width = int(size * width / height)
+        height = size
+    width_mod = np.mod(width, 16)
+    height_mod = np.mod(height, 16)
+    width = width - width_mod
+    height = height - height_mod
+    return (height, width)
 
 
 # Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py
index 1039e4c91b..558c3f7391 100644
--- a/tests/models/yolos/test_image_processing_yolos.py
+++ b/tests/models/yolos/test_image_processing_yolos.py
@@ -86,18 +86,28 @@ class YolosImageProcessingTester(unittest.TestCase):
         if not batched:
             image = image_inputs[0]
             if isinstance(image, Image.Image):
-                w, h = image.size
+                width, height = image.size
             else:
-                h, w = image.shape[1], image.shape[2]
-            if w < h:
-                expected_height = int(self.size["shortest_edge"] * h / w)
-                expected_width = self.size["shortest_edge"]
-            elif w > h:
-                expected_height = self.size["shortest_edge"]
-                expected_width = int(self.size["shortest_edge"] * w / h)
-            else:
-                expected_height = self.size["shortest_edge"]
-                expected_width = self.size["shortest_edge"]
+                height, width = image.shape[1], image.shape[2]
+
+            size = self.size["shortest_edge"]
+            max_size = self.size.get("longest_edge", None)
+            if max_size is not None:
+                min_original_size = float(min((height, width)))
+                max_original_size = float(max((height, width)))
+                if max_original_size / min_original_size * size > max_size:
+                    size = int(round(max_size * min_original_size / max_original_size))
+
+            if width < height and width != size:
+                height = int(size * height / width)
+                width = size
+            elif height < width and height != size:
+                width = int(size * width / height)
+                height = size
+            width_mod = width % 16
+            height_mod = height % 16
+            expected_width = width - width_mod
+            expected_height = height - height_mod
 
         else:
             expected_values = []
@@ -173,6 +183,18 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
             torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)
         )
 
+    def test_resize_max_size_respected(self):
+        image_processor = self.image_processing_class(**self.image_processor_dict)
+
+        # 15:1 aspect-ratio image: scaling the short side to 800 would push the long side far beyond 1333
+        image = torch.randint(0, 256, (3, 100, 1500), dtype=torch.uint8)
+        processed_image = image_processor(
+            image, size={"longest_edge": 1333, "shortest_edge": 800}, do_pad=False, return_tensors="pt"
+        )["pixel_values"]
+
+        self.assertTrue(processed_image.shape[-1] <= 1333)
+        self.assertTrue(processed_image.shape[-2] <= 800)
+
+
     @slow
     def test_call_pytorch_with_coco_detection_annotations(self):
         # prepare image and target