From 874ac129bbc7e9aa478d0cce7d925af8cfb8425e Mon Sep 17 00:00:00 2001 From: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Date: Tue, 4 Jun 2024 00:12:08 +0900 Subject: [PATCH] fix the get_size_with_aspect_ratio in max_size situation (#30902) * fix the get_size_with_aspect_ratio in max_size situation * make fix-up * add more general solution * consider when max_size is not defined * fix typo * fix typo * simple fix * fix error * fix if else error * fix error of size overwrite * fix yolos image processing * fix detr image processing * make * add longest related test script * Update src/transformers/models/yolos/image_processing_yolos.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * add more test * add test script about longest size * remove deprecated --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../image_processing_conditional_detr.py | 20 ++++--- .../image_processing_deformable_detr.py | 20 ++++--- .../deprecated/deta/image_processing_deta.py | 20 ++++--- .../models/detr/image_processing_detr.py | 20 ++++--- .../image_processing_grounding_dino.py | 20 ++++--- .../models/yolos/image_processing_yolos.py | 45 +++++++++++----- .../test_image_processing_conditional_detr.py | 52 +++++++++++++++++++ .../test_image_processing_deformable_detr.py | 52 +++++++++++++++++++ .../models/detr/test_image_processing_detr.py | 52 +++++++++++++++++++ .../test_image_processing_grounding_dino.py | 52 +++++++++++++++++++ 10 files changed, 309 insertions(+), 44 deletions(-) diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 46a96a76cf..c7bc27207b 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -100,21 +100,29 @@ def 
get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in The maximum allowed output size. """ height, width = image_size + raw_size = None if max_size is not None: min_original_size = float(min((height, width))) max_original_size = float(max((height, width))) if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) if (height <= width and height == size) or (width <= height and width == size): - return height, width - - if width < height: + oh, ow = height, width + elif width < height: ow = size - oh = int(size * height / width) + if max_size is not None and raw_size is not None: + oh = int(raw_size * height / width) + else: + oh = int(size * height / width) else: oh = size - ow = int(size * width / height) + if max_size is not None and raw_size is not None: + ow = int(raw_size * width / height) + else: + ow = int(size * width / height) + return (oh, ow) diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index f1ce6797e8..8c149f5549 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -98,21 +98,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in The maximum allowed output size. 
""" height, width = image_size + raw_size = None if max_size is not None: min_original_size = float(min((height, width))) max_original_size = float(max((height, width))) if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) if (height <= width and height == size) or (width <= height and width == size): - return height, width - - if width < height: + oh, ow = height, width + elif width < height: ow = size - oh = int(size * height / width) + if max_size is not None and raw_size is not None: + oh = int(raw_size * height / width) + else: + oh = int(size * height / width) else: oh = size - ow = int(size * width / height) + if max_size is not None and raw_size is not None: + ow = int(raw_size * width / height) + else: + ow = int(size * width / height) + return (oh, ow) diff --git a/src/transformers/models/deprecated/deta/image_processing_deta.py b/src/transformers/models/deprecated/deta/image_processing_deta.py index 9e660198a2..a548590ce1 100644 --- a/src/transformers/models/deprecated/deta/image_processing_deta.py +++ b/src/transformers/models/deprecated/deta/image_processing_deta.py @@ -91,21 +91,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in The maximum allowed output size. 
""" height, width = image_size + raw_size = None if max_size is not None: min_original_size = float(min((height, width))) max_original_size = float(max((height, width))) if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) if (height <= width and height == size) or (width <= height and width == size): - return height, width - - if width < height: + oh, ow = height, width + elif width < height: ow = size - oh = int(size * height / width) + if max_size is not None and raw_size is not None: + oh = int(raw_size * height / width) + else: + oh = int(size * height / width) else: oh = size - ow = int(size * width / height) + if max_size is not None and raw_size is not None: + ow = int(raw_size * width / height) + else: + ow = int(size * width / height) + return (oh, ow) diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index e6c2ee16a8..10d1b4d5d4 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -98,21 +98,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in The maximum allowed output size. 
""" height, width = image_size + raw_size = None if max_size is not None: min_original_size = float(min((height, width))) max_original_size = float(max((height, width))) if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) if (height <= width and height == size) or (width <= height and width == size): - return height, width - - if width < height: + oh, ow = height, width + elif width < height: ow = size - oh = int(size * height / width) + if max_size is not None and raw_size is not None: + oh = int(raw_size * height / width) + else: + oh = int(size * height / width) else: oh = size - ow = int(size * width / height) + if max_size is not None and raw_size is not None: + ow = int(raw_size * width / height) + else: + ow = int(size * width / height) + return (oh, ow) diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 08a5a70bf4..569e22ba47 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -105,21 +105,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in The maximum allowed output size. 
""" height, width = image_size + raw_size = None if max_size is not None: min_original_size = float(min((height, width))) max_original_size = float(max((height, width))) if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) if (height <= width and height == size) or (width <= height and width == size): - return height, width - - if width < height: + oh, ow = height, width + elif width < height: ow = size - oh = int(size * height / width) + if max_size is not None and raw_size is not None: + oh = int(raw_size * height / width) + else: + oh = int(size * height / width) else: oh = size - ow = int(size * width / height) + if max_size is not None and raw_size is not None: + ow = int(raw_size * width / height) + else: + ow = int(size * width / height) + return (oh, ow) diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 669dab238e..19b21333f6 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -101,9 +101,11 @@ def get_max_height_width( return (max_height, max_width) -def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: +def get_size_with_aspect_ratio( + image_size: Tuple[int, int], size: int, max_size: Optional[int] = None, mod_size: int = 16 +) -> Tuple[int, int]: """ - Computes the output image size given the input image size and the desired output size. + Computes the output image size given the input image size and the desired output size with multiple of divisible_size. Args: image_size (`Tuple[int, int]`): @@ -112,25 +114,40 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in The desired output size. max_size (`int`, *optional*): The maximum allowed output size. 
+ mod_size (`int`, *optional*): + The size to make multiple of mod_size. """ height, width = image_size + raw_size = None if max_size is not None: min_original_size = float(min((height, width))) max_original_size = float(max((height, width))) if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) - if width <= height and width != size: - height = int(size * height / width) - width = size - elif height < width and height != size: - width = int(size * width / height) - height = size - width_mod = np.mod(width, 16) - height_mod = np.mod(height, 16) - width = width - width_mod - height = height - height_mod - return (height, width) + if width < height: + ow = size + if max_size is not None and raw_size is not None: + oh = int(raw_size * height / width) + else: + oh = int(size * height / width) + elif (height <= width and height == size) or (width <= height and width == size): + oh, ow = height, width + else: + oh = size + if max_size is not None and raw_size is not None: + ow = int(raw_size * width / height) + else: + ow = int(size * width / height) + + if mod_size is not None: + ow_mod = np.mod(ow, mod_size) + oh_mod = np.mod(oh, mod_size) + ow = ow - ow_mod + oh = oh - oh_mod + + return (oh, ow) # Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py index 61dcdc873d..7bbee7e831 100644 --- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py +++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py @@ -537,3 +537,55 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess ) inputs = image_processor(images=[image_1, image_2], 
return_tensors="pt") self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + + def test_longest_edge_shortest_edge_resizing_strategy(self): + image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) + + # max size is set; width < height; + # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 + image_processor = ConditionalDetrImageProcessor( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436])) + + image_2 = torch.ones([653, 958, 3], dtype=torch.uint8) + # max size is set; height < width; + # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 + image_processor = ConditionalDetrImageProcessor( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640])) + + image_3 = torch.ones([100, 120, 3], dtype=torch.uint8) + # max size is set; height == size; width > max_size; + # do_pad=False, longest_edge=118, shortest_edge=100, image=100x120 -> 98x118 + image_processor = ConditionalDetrImageProcessor( + size={"longest_edge": 118, "shortest_edge": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_3], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118])) + + image_4 = torch.ones([128, 50, 3], dtype=torch.uint8) + # max size is set; width == size; height < max_size; + # do_pad=False, longest_edge=256, shortest_edge=50, image=128x50 -> 128x50 + image_processor = ConditionalDetrImageProcessor( + size={"longest_edge": 256, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_4], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50])) + + image_5 = torch.ones([50,
50, 3], dtype=torch.uint8) + # max size is set; height == width; width < max_size; + # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50 + image_processor = ConditionalDetrImageProcessor( + size={"longest_edge": 117, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_5], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50])) diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py index 49139c7539..59ba5b59e3 100644 --- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py +++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py @@ -539,3 +539,55 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi ) inputs = image_processor(images=[image_1, image_2], return_tensors="pt") self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + + def test_longest_edge_shortest_edge_resizing_strategy(self): + image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) + + # max size is set; width < height; + # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 + image_processor = DeformableDetrImageProcessor( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436])) + + image_2 = torch.ones([653, 958, 3], dtype=torch.uint8) + # max size is set; height < width; + # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 + image_processor = DeformableDetrImageProcessor( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640])) + + image_3 = 
torch.ones([100, 120, 3], dtype=torch.uint8) + # max size is set; height == size; width > max_size; + # do_pad=False, longest_edge=118, shortest_edge=100, image=100x120 -> 98x118 + image_processor = DeformableDetrImageProcessor( + size={"longest_edge": 118, "shortest_edge": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_3], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118])) + + image_4 = torch.ones([128, 50, 3], dtype=torch.uint8) + # max size is set; width == size; height < max_size; + # do_pad=False, longest_edge=256, shortest_edge=50, image=128x50 -> 128x50 + image_processor = DeformableDetrImageProcessor( + size={"longest_edge": 256, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_4], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50])) + + image_5 = torch.ones([50, 50, 3], dtype=torch.uint8) + # max size is set; height == width; width < max_size; + # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50 + image_processor = DeformableDetrImageProcessor( + size={"longest_edge": 117, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_5], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50])) diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index ede06be6c5..7f9f18b9d4 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ b/tests/models/detr/test_image_processing_detr.py @@ -593,3 +593,55 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi ) inputs = image_processor(images=[image_1, image_2], return_tensors="pt") self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + + def test_longest_edge_shortest_edge_resizing_strategy(self): + image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) + + # max size
is set; width < height; + # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 + image_processor = DetrImageProcessor( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436])) + + image_2 = torch.ones([653, 958, 3], dtype=torch.uint8) + # max size is set; height < width; + # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 + image_processor = DetrImageProcessor( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640])) + + image_3 = torch.ones([100, 120, 3], dtype=torch.uint8) + # max size is set; height == size; width > max_size; + # do_pad=False, longest_edge=118, shortest_edge=100, image=100x120 -> 98x118 + image_processor = DetrImageProcessor( + size={"longest_edge": 118, "shortest_edge": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_3], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118])) + + image_4 = torch.ones([128, 50, 3], dtype=torch.uint8) + # max size is set; width == size; height < max_size; + # do_pad=False, longest_edge=256, shortest_edge=50, image=128x50 -> 128x50 + image_processor = DetrImageProcessor( + size={"longest_edge": 256, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_4], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50])) + + image_5 = torch.ones([50, 50, 3], dtype=torch.uint8) + # max size is set; height == width; width < max_size; + # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50 + image_processor = DetrImageProcessor( + size={"longest_edge": 117, "shortest_edge": 50}, + do_pad=False, + ) + inputs =
image_processor(images=[image_5], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50])) diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index 5cd09ce238..6d20a01981 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -575,3 +575,55 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin ) inputs = image_processor(images=[image_1, image_2], return_tensors="pt") self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + + def test_longest_edge_shortest_edge_resizing_strategy(self): + image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) + + # max size is set; width < height; + # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 + image_processor = GroundingDinoImageProcessor( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436])) + + image_2 = torch.ones([653, 958, 3], dtype=torch.uint8) + # max size is set; height < width; + # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 + image_processor = GroundingDinoImageProcessor( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640])) + + image_3 = torch.ones([100, 120, 3], dtype=torch.uint8) + # max size is set; height == size; width > max_size; + # do_pad=False, longest_edge=118, shortest_edge=100, image=100x120 -> 98x118 + image_processor = GroundingDinoImageProcessor( + size={"longest_edge": 118, "shortest_edge": 100}, + do_pad=False, + ) + inputs =
image_processor(images=[image_3], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118])) + + image_4 = torch.ones([128, 50, 3], dtype=torch.uint8) + # max size is set; width == size; height < max_size; + # do_pad=False, longest_edge=256, shortest_edge=50, image=128x50 -> 128x50 + image_processor = GroundingDinoImageProcessor( + size={"longest_edge": 256, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_4], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50])) + + image_5 = torch.ones([50, 50, 3], dtype=torch.uint8) + # max size is set; height == width; width < max_size; + # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50 + image_processor = GroundingDinoImageProcessor( + size={"longest_edge": 117, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_5], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))