From 874ac129bbc7e9aa478d0cce7d925af8cfb8425e Mon Sep 17 00:00:00 2001
From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com>
Date: Tue, 4 Jun 2024 00:12:08 +0900
Subject: [PATCH] fix the get_size_with_aspect_ratio in max_size situation
 (#30902)

* fix the get_size_with_aspect_ratio in max_size situation

* make fix-up

* add more general solution

* consider when max_size is not defined

* fix typo

* fix typo

* simple fix

* fix error

* fix if else error

* fix error of size overwrite

* fix yolos image processing

* fix detr image processing

* make

* add longest related test script

* Update src/transformers/models/yolos/image_processing_yolos.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* add more test

* add test script about longest size

* remove deprecated

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 .../image_processing_conditional_detr.py      | 20 ++++---
 .../image_processing_deformable_detr.py       | 20 ++++---
 .../deprecated/deta/image_processing_deta.py  | 20 ++++---
 .../models/detr/image_processing_detr.py      | 20 ++++---
 .../image_processing_grounding_dino.py        | 20 ++++---
 .../models/yolos/image_processing_yolos.py    | 45 +++++++++++-----
 .../test_image_processing_conditional_detr.py | 52 +++++++++++++++++++
 .../test_image_processing_deformable_detr.py  | 52 +++++++++++++++++++
 .../models/detr/test_image_processing_detr.py | 52 +++++++++++++++++++
 .../test_image_processing_grounding_dino.py   | 52 +++++++++++++++++++
 10 files changed, 309 insertions(+), 44 deletions(-)

diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
index 46a96a76cf..c7bc27207b 100644
--- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
+++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
@@ -100,21 +100,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
             The maximum allowed output size.
     """
     height, width = image_size
+    raw_size = None
     if max_size is not None:
         min_original_size = float(min((height, width)))
         max_original_size = float(max((height, width)))
         if max_original_size / min_original_size * size > max_size:
-            size = int(round(max_size * min_original_size / max_original_size))
+            raw_size = max_size * min_original_size / max_original_size
+            size = int(round(raw_size))
 
     if (height <= width and height == size) or (width <= height and width == size):
-        return height, width
-
-    if width < height:
+        oh, ow = height, width
+    elif width < height:
         ow = size
-        oh = int(size * height / width)
+        if max_size is not None and raw_size is not None:
+            oh = int(raw_size * height / width)
+        else:
+            oh = int(size * height / width)
     else:
         oh = size
-        ow = int(size * width / height)
+        if max_size is not None and raw_size is not None:
+            ow = int(raw_size * width / height)
+        else:
+            ow = int(size * width / height)
+
     return (oh, ow)
 
 
diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py
index f1ce6797e8..8c149f5549 100644
--- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py
+++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py
@@ -98,21 +98,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
             The maximum allowed output size.
     """
     height, width = image_size
+    raw_size = None
     if max_size is not None:
         min_original_size = float(min((height, width)))
         max_original_size = float(max((height, width)))
         if max_original_size / min_original_size * size > max_size:
-            size = int(round(max_size * min_original_size / max_original_size))
+            raw_size = max_size * min_original_size / max_original_size
+            size = int(round(raw_size))
 
     if (height <= width and height == size) or (width <= height and width == size):
-        return height, width
-
-    if width < height:
+        oh, ow = height, width
+    elif width < height:
         ow = size
-        oh = int(size * height / width)
+        if max_size is not None and raw_size is not None:
+            oh = int(raw_size * height / width)
+        else:
+            oh = int(size * height / width)
     else:
         oh = size
-        ow = int(size * width / height)
+        if max_size is not None and raw_size is not None:
+            ow = int(raw_size * width / height)
+        else:
+            ow = int(size * width / height)
+
     return (oh, ow)
 
 
diff --git a/src/transformers/models/deprecated/deta/image_processing_deta.py b/src/transformers/models/deprecated/deta/image_processing_deta.py
index 9e660198a2..a548590ce1 100644
--- a/src/transformers/models/deprecated/deta/image_processing_deta.py
+++ b/src/transformers/models/deprecated/deta/image_processing_deta.py
@@ -91,21 +91,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
             The maximum allowed output size.
     """
     height, width = image_size
+    raw_size = None
     if max_size is not None:
         min_original_size = float(min((height, width)))
         max_original_size = float(max((height, width)))
         if max_original_size / min_original_size * size > max_size:
-            size = int(round(max_size * min_original_size / max_original_size))
+            raw_size = max_size * min_original_size / max_original_size
+            size = int(round(raw_size))
 
     if (height <= width and height == size) or (width <= height and width == size):
-        return height, width
-
-    if width < height:
+        oh, ow = height, width
+    elif width < height:
         ow = size
-        oh = int(size * height / width)
+        if max_size is not None and raw_size is not None:
+            oh = int(raw_size * height / width)
+        else:
+            oh = int(size * height / width)
     else:
         oh = size
-        ow = int(size * width / height)
+        if max_size is not None and raw_size is not None:
+            ow = int(raw_size * width / height)
+        else:
+            ow = int(size * width / height)
+
     return (oh, ow)
 
 
diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py
index e6c2ee16a8..10d1b4d5d4 100644
--- a/src/transformers/models/detr/image_processing_detr.py
+++ b/src/transformers/models/detr/image_processing_detr.py
@@ -98,21 +98,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
             The maximum allowed output size.
     """
     height, width = image_size
+    raw_size = None
     if max_size is not None:
         min_original_size = float(min((height, width)))
         max_original_size = float(max((height, width)))
         if max_original_size / min_original_size * size > max_size:
-            size = int(round(max_size * min_original_size / max_original_size))
+            raw_size = max_size * min_original_size / max_original_size
+            size = int(round(raw_size))
 
     if (height <= width and height == size) or (width <= height and width == size):
-        return height, width
-
-    if width < height:
+        oh, ow = height, width
+    elif width < height:
         ow = size
-        oh = int(size * height / width)
+        if max_size is not None and raw_size is not None:
+            oh = int(raw_size * height / width)
+        else:
+            oh = int(size * height / width)
     else:
         oh = size
-        ow = int(size * width / height)
+        if max_size is not None and raw_size is not None:
+            ow = int(raw_size * width / height)
+        else:
+            ow = int(size * width / height)
+
     return (oh, ow)
 
 
diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py
index 08a5a70bf4..569e22ba47 100644
--- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py
+++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py
@@ -105,21 +105,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
             The maximum allowed output size.
     """
     height, width = image_size
+    raw_size = None
     if max_size is not None:
         min_original_size = float(min((height, width)))
         max_original_size = float(max((height, width)))
         if max_original_size / min_original_size * size > max_size:
-            size = int(round(max_size * min_original_size / max_original_size))
+            raw_size = max_size * min_original_size / max_original_size
+            size = int(round(raw_size))
 
     if (height <= width and height == size) or (width <= height and width == size):
-        return height, width
-
-    if width < height:
+        oh, ow = height, width
+    elif width < height:
         ow = size
-        oh = int(size * height / width)
+        if max_size is not None and raw_size is not None:
+            oh = int(raw_size * height / width)
+        else:
+            oh = int(size * height / width)
     else:
         oh = size
-        ow = int(size * width / height)
+        if max_size is not None and raw_size is not None:
+            ow = int(raw_size * width / height)
+        else:
+            ow = int(size * width / height)
+
     return (oh, ow)
 
 
diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py
index 669dab238e..19b21333f6 100644
--- a/src/transformers/models/yolos/image_processing_yolos.py
+++ b/src/transformers/models/yolos/image_processing_yolos.py
@@ -101,9 +101,11 @@ def get_max_height_width(
     return (max_height, max_width)
 
 
-def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
+def get_size_with_aspect_ratio(
+    image_size: Tuple[int, int], size: int, max_size: Optional[int] = None, mod_size: int = 16
+) -> Tuple[int, int]:
     """
-    Computes the output image size given the input image size and the desired output size.
+    Computes the output image size given the input image size and the desired output size, with the output height and width each rounded down to a multiple of `mod_size`.
 
     Args:
         image_size (`Tuple[int, int]`):
@@ -112,25 +114,40 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
             The desired output size.
         max_size (`int`, *optional*):
             The maximum allowed output size.
+        mod_size (`int`, *optional*, defaults to 16):
+            The output height and width are each rounded down to a multiple of `mod_size`. If `None`, no rounding is applied.
     """
     height, width = image_size
+    raw_size = None
     if max_size is not None:
         min_original_size = float(min((height, width)))
         max_original_size = float(max((height, width)))
         if max_original_size / min_original_size * size > max_size:
-            size = int(round(max_size * min_original_size / max_original_size))
+            raw_size = max_size * min_original_size / max_original_size
+            size = int(round(raw_size))
 
-    if width <= height and width != size:
-        height = int(size * height / width)
-        width = size
-    elif height < width and height != size:
-        width = int(size * width / height)
-        height = size
-    width_mod = np.mod(width, 16)
-    height_mod = np.mod(height, 16)
-    width = width - width_mod
-    height = height - height_mod
-    return (height, width)
+    if width < height:
+        ow = size
+        if max_size is not None and raw_size is not None:
+            oh = int(raw_size * height / width)
+        else:
+            oh = int(size * height / width)
+    elif (height <= width and height == size) or (width <= height and width == size):
+        oh, ow = height, width
+    else:
+        oh = size
+        if max_size is not None and raw_size is not None:
+            ow = int(raw_size * width / height)
+        else:
+            ow = int(size * width / height)
+
+    if mod_size is not None:
+        ow_mod = np.mod(ow, mod_size)
+        oh_mod = np.mod(oh, mod_size)
+        ow = ow - ow_mod
+        oh = oh - oh_mod
+
+    return (oh, ow)
 
 
 # Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py
index 61dcdc873d..7bbee7e831 100644
--- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py
+++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py
@@ -537,3 +537,55 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
         )
         inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
         self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
+
+    def test_longest_edge_shortest_edge_resizing_strategy(self):
+        image_1 = torch.ones([958, 653, 3], dtype=torch.uint8)
+
+        # max size is set; width < height;
+        # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436
+        image_processor = ConditionalDetrImageProcessor(
+            size={"longest_edge": 640, "shortest_edge": 640},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436]))
+
+        image_2 = torch.ones([653, 958, 3], dtype=torch.uint8)
+        # max size is set; height < width;
+        # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640
+        image_processor = ConditionalDetrImageProcessor(
+            size={"longest_edge": 640, "shortest_edge": 640},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640]))
+
+        image_3 = torch.ones([100, 120, 3], dtype=torch.uint8)
+        # max size is set; height == size; width > max_size;
+        # do_pad=False, longest_edge=118, shortest_edge=100, image=100x120 -> 98x118
+        image_processor = ConditionalDetrImageProcessor(
+            size={"longest_edge": 118, "shortest_edge": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_3], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118]))
+
+        image_4 = torch.ones([128, 50, 3], dtype=torch.uint8)
+        # max size is set; width == size; height < max_size;
+        # do_pad=False, longest_edge=256, shortest_edge=50, image=128x50 -> 128x50
+        image_processor = ConditionalDetrImageProcessor(
+            size={"longest_edge": 256, "shortest_edge": 50},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_4], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50]))
+
+        image_5 = torch.ones([50, 50, 3], dtype=torch.uint8)
+        # max size is set; height == width; width < max_size;
+        # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50
+        image_processor = ConditionalDetrImageProcessor(
+            size={"longest_edge": 117, "shortest_edge": 50},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_5], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))
diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
index 49139c7539..59ba5b59e3 100644
--- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py
+++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
@@ -539,3 +539,55 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
         )
         inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
         self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
+
+    def test_longest_edge_shortest_edge_resizing_strategy(self):
+        image_1 = torch.ones([958, 653, 3], dtype=torch.uint8)
+
+        # max size is set; width < height;
+        # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436
+        image_processor = DeformableDetrImageProcessor(
+            size={"longest_edge": 640, "shortest_edge": 640},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436]))
+
+        image_2 = torch.ones([653, 958, 3], dtype=torch.uint8)
+        # max size is set; height < width;
+        # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640
+        image_processor = DeformableDetrImageProcessor(
+            size={"longest_edge": 640, "shortest_edge": 640},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640]))
+
+        image_3 = torch.ones([100, 120, 3], dtype=torch.uint8)
+        # max size is set; height == size; width > max_size;
+        # do_pad=False, longest_edge=118, shortest_edge=100, image=100x120 -> 98x118
+        image_processor = DeformableDetrImageProcessor(
+            size={"longest_edge": 118, "shortest_edge": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_3], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118]))
+
+        image_4 = torch.ones([128, 50, 3], dtype=torch.uint8)
+        # max size is set; width == size; height < max_size;
+        # do_pad=False, longest_edge=256, shortest_edge=50, image=128x50 -> 128x50
+        image_processor = DeformableDetrImageProcessor(
+            size={"longest_edge": 256, "shortest_edge": 50},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_4], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50]))
+
+        image_5 = torch.ones([50, 50, 3], dtype=torch.uint8)
+        # max size is set; height == width; width < max_size;
+        # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50
+        image_processor = DeformableDetrImageProcessor(
+            size={"longest_edge": 117, "shortest_edge": 50},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_5], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))
diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py
index ede06be6c5..7f9f18b9d4 100644
--- a/tests/models/detr/test_image_processing_detr.py
+++ b/tests/models/detr/test_image_processing_detr.py
@@ -593,3 +593,55 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
         )
         inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
         self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
+
+    def test_longest_edge_shortest_edge_resizing_strategy(self):
+        image_1 = torch.ones([958, 653, 3], dtype=torch.uint8)
+
+        # max size is set; width < height;
+        # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436
+        image_processor = DetrImageProcessor(
+            size={"longest_edge": 640, "shortest_edge": 640},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436]))
+
+        image_2 = torch.ones([653, 958, 3], dtype=torch.uint8)
+        # max size is set; height < width;
+        # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640
+        image_processor = DetrImageProcessor(
+            size={"longest_edge": 640, "shortest_edge": 640},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640]))
+
+        image_3 = torch.ones([100, 120, 3], dtype=torch.uint8)
+        # max size is set; height == size; width > max_size;
+        # do_pad=False, longest_edge=118, shortest_edge=100, image=100x120 -> 98x118
+        image_processor = DetrImageProcessor(
+            size={"longest_edge": 118, "shortest_edge": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_3], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118]))
+
+        image_4 = torch.ones([128, 50, 3], dtype=torch.uint8)
+        # max size is set; width == size; height < max_size;
+        # do_pad=False, longest_edge=256, shortest_edge=50, image=128x50 -> 128x50
+        image_processor = DetrImageProcessor(
+            size={"longest_edge": 256, "shortest_edge": 50},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_4], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50]))
+
+        image_5 = torch.ones([50, 50, 3], dtype=torch.uint8)
+        # max size is set; height == width; width < max_size;
+        # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50
+        image_processor = DetrImageProcessor(
+            size={"longest_edge": 117, "shortest_edge": 50},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_5], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))
diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py
index 5cd09ce238..6d20a01981 100644
--- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py
+++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py
@@ -575,3 +575,55 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
         )
         inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
         self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
+
+    def test_longest_edge_shortest_edge_resizing_strategy(self):
+        image_1 = torch.ones([958, 653, 3], dtype=torch.uint8)
+
+        # max size is set; width < height;
+        # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436
+        image_processor = GroundingDinoImageProcessor(
+            size={"longest_edge": 640, "shortest_edge": 640},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436]))
+
+        image_2 = torch.ones([653, 958, 3], dtype=torch.uint8)
+        # max size is set; height < width;
+        # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640
+        image_processor = GroundingDinoImageProcessor(
+            size={"longest_edge": 640, "shortest_edge": 640},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640]))
+
+        image_3 = torch.ones([100, 120, 3], dtype=torch.uint8)
+        # max size is set; height == size; width > max_size;
+        # do_pad=False, longest_edge=118, shortest_edge=100, image=100x120 -> 98x118
+        image_processor = GroundingDinoImageProcessor(
+            size={"longest_edge": 118, "shortest_edge": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_3], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118]))
+
+        image_4 = torch.ones([128, 50, 3], dtype=torch.uint8)
+        # max size is set; width == size; height < max_size;
+        # do_pad=False, longest_edge=256, shortest_edge=50, image=128x50 -> 128x50
+        image_processor = GroundingDinoImageProcessor(
+            size={"longest_edge": 256, "shortest_edge": 50},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_4], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50]))
+
+        image_5 = torch.ones([50, 50, 3], dtype=torch.uint8)
+        # max size is set; height == width; width < max_size;
+        # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50
+        image_processor = GroundingDinoImageProcessor(
+            size={"longest_edge": 117, "shortest_edge": 50},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_5], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))