fix the get_size_with_aspect_ratio in max_size situation (#30902)

* fix the get_size_with_aspect_ratio in max_size situation

* make fix-up

* add more general solution

* consider when max_size is not defined

* fix typo

* fix typo

* simple fix

* fix error

* fix if else error

* fix error of size overwrite

* fix yolos image processing

* fix detr image processing

* make

* add longest related test script

* Update src/transformers/models/yolos/image_processing_yolos.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* add more test

* add test script about longest size

* remove deprecated

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
Sangbum Daniel Choi
2024-06-04 00:12:08 +09:00
committed by GitHub
parent e4628434d8
commit 874ac129bb
10 changed files with 309 additions and 44 deletions

View File

@@ -100,21 +100,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
The maximum allowed output size. The maximum allowed output size.
""" """
height, width = image_size height, width = image_size
raw_size = None
if max_size is not None: if max_size is not None:
min_original_size = float(min((height, width))) min_original_size = float(min((height, width)))
max_original_size = float(max((height, width))) max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size: if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size)) raw_size = max_size * min_original_size / max_original_size
size = int(round(raw_size))
if (height <= width and height == size) or (width <= height and width == size): if (height <= width and height == size) or (width <= height and width == size):
return height, width oh, ow = height, width
elif width < height:
if width < height:
ow = size ow = size
if max_size is not None and raw_size is not None:
oh = int(raw_size * height / width)
else:
oh = int(size * height / width) oh = int(size * height / width)
else: else:
oh = size oh = size
if max_size is not None and raw_size is not None:
ow = int(raw_size * width / height)
else:
ow = int(size * width / height) ow = int(size * width / height)
return (oh, ow) return (oh, ow)

View File

@@ -98,21 +98,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
The maximum allowed output size. The maximum allowed output size.
""" """
height, width = image_size height, width = image_size
raw_size = None
if max_size is not None: if max_size is not None:
min_original_size = float(min((height, width))) min_original_size = float(min((height, width)))
max_original_size = float(max((height, width))) max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size: if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size)) raw_size = max_size * min_original_size / max_original_size
size = int(round(raw_size))
if (height <= width and height == size) or (width <= height and width == size): if (height <= width and height == size) or (width <= height and width == size):
return height, width oh, ow = height, width
elif width < height:
if width < height:
ow = size ow = size
if max_size is not None and raw_size is not None:
oh = int(raw_size * height / width)
else:
oh = int(size * height / width) oh = int(size * height / width)
else: else:
oh = size oh = size
if max_size is not None and raw_size is not None:
ow = int(raw_size * width / height)
else:
ow = int(size * width / height) ow = int(size * width / height)
return (oh, ow) return (oh, ow)

View File

@@ -91,21 +91,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
The maximum allowed output size. The maximum allowed output size.
""" """
height, width = image_size height, width = image_size
raw_size = None
if max_size is not None: if max_size is not None:
min_original_size = float(min((height, width))) min_original_size = float(min((height, width)))
max_original_size = float(max((height, width))) max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size: if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size)) raw_size = max_size * min_original_size / max_original_size
size = int(round(raw_size))
if (height <= width and height == size) or (width <= height and width == size): if (height <= width and height == size) or (width <= height and width == size):
return height, width oh, ow = height, width
elif width < height:
if width < height:
ow = size ow = size
if max_size is not None and raw_size is not None:
oh = int(raw_size * height / width)
else:
oh = int(size * height / width) oh = int(size * height / width)
else: else:
oh = size oh = size
if max_size is not None and raw_size is not None:
ow = int(raw_size * width / height)
else:
ow = int(size * width / height) ow = int(size * width / height)
return (oh, ow) return (oh, ow)

View File

@@ -98,21 +98,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
The maximum allowed output size. The maximum allowed output size.
""" """
height, width = image_size height, width = image_size
raw_size = None
if max_size is not None: if max_size is not None:
min_original_size = float(min((height, width))) min_original_size = float(min((height, width)))
max_original_size = float(max((height, width))) max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size: if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size)) raw_size = max_size * min_original_size / max_original_size
size = int(round(raw_size))
if (height <= width and height == size) or (width <= height and width == size): if (height <= width and height == size) or (width <= height and width == size):
return height, width oh, ow = height, width
elif width < height:
if width < height:
ow = size ow = size
if max_size is not None and raw_size is not None:
oh = int(raw_size * height / width)
else:
oh = int(size * height / width) oh = int(size * height / width)
else: else:
oh = size oh = size
if max_size is not None and raw_size is not None:
ow = int(raw_size * width / height)
else:
ow = int(size * width / height) ow = int(size * width / height)
return (oh, ow) return (oh, ow)

View File

@@ -105,21 +105,29 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
The maximum allowed output size. The maximum allowed output size.
""" """
height, width = image_size height, width = image_size
raw_size = None
if max_size is not None: if max_size is not None:
min_original_size = float(min((height, width))) min_original_size = float(min((height, width)))
max_original_size = float(max((height, width))) max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size: if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size)) raw_size = max_size * min_original_size / max_original_size
size = int(round(raw_size))
if (height <= width and height == size) or (width <= height and width == size): if (height <= width and height == size) or (width <= height and width == size):
return height, width oh, ow = height, width
elif width < height:
if width < height:
ow = size ow = size
if max_size is not None and raw_size is not None:
oh = int(raw_size * height / width)
else:
oh = int(size * height / width) oh = int(size * height / width)
else: else:
oh = size oh = size
if max_size is not None and raw_size is not None:
ow = int(raw_size * width / height)
else:
ow = int(size * width / height) ow = int(size * width / height)
return (oh, ow) return (oh, ow)

View File

@@ -101,9 +101,11 @@ def get_max_height_width(
return (max_height, max_width) return (max_height, max_width)
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: def get_size_with_aspect_ratio(
image_size: Tuple[int, int], size: int, max_size: Optional[int] = None, mod_size: int = 16
) -> Tuple[int, int]:
""" """
Computes the output image size given the input image size and the desired output size. Computes the output image size given the input image size and the desired output size with multiple of divisible_size.
Args: Args:
image_size (`Tuple[int, int]`): image_size (`Tuple[int, int]`):
@@ -112,25 +114,40 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
The desired output size. The desired output size.
max_size (`int`, *optional*): max_size (`int`, *optional*):
The maximum allowed output size. The maximum allowed output size.
mod_size (`int`, *optional*):
The size to make multiple of mod_size.
""" """
height, width = image_size height, width = image_size
raw_size = None
if max_size is not None: if max_size is not None:
min_original_size = float(min((height, width))) min_original_size = float(min((height, width)))
max_original_size = float(max((height, width))) max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size: if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size)) raw_size = max_size * min_original_size / max_original_size
size = int(round(raw_size))
if width <= height and width != size: if width < height:
height = int(size * height / width) ow = size
width = size if max_size is not None and raw_size is not None:
elif height < width and height != size: oh = int(raw_size * height / width)
width = int(size * width / height) else:
height = size oh = int(size * height / width)
width_mod = np.mod(width, 16) elif (height <= width and height == size) or (width <= height and width == size):
height_mod = np.mod(height, 16) oh, ow = height, width
width = width - width_mod else:
height = height - height_mod oh = size
return (height, width) if max_size is not None and raw_size is not None:
ow = int(raw_size * width / height)
else:
ow = int(size * width / height)
if mod_size is not None:
ow_mod = np.mod(ow, mod_size)
oh_mod = np.mod(oh, mod_size)
ow = ow - ow_mod
oh = oh - oh_mod
return (oh, ow)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width # Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width

View File

@@ -537,3 +537,55 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
) )
inputs = image_processor(images=[image_1, image_2], return_tensors="pt") inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
def test_longest_edge_shortest_edge_resizing_strategy(self):
image_1 = torch.ones([958, 653, 3], dtype=torch.uint8)
# max size is set; width < height;
# do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436
image_processor = ConditionalDetrImageProcessor(
size={"longest_edge": 640, "shortest_edge": 640},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436]))
image_2 = torch.ones([653, 958, 3], dtype=torch.uint8)
# max size is set; height < width;
# do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640
image_processor = ConditionalDetrImageProcessor(
size={"longest_edge": 640, "shortest_edge": 640},
do_pad=False,
)
inputs = image_processor(images=[image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640]))
image_3 = torch.ones([100, 120, 3], dtype=torch.uint8)
# max size is set; width == size; height > max_size;
# do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98
image_processor = ConditionalDetrImageProcessor(
size={"longest_edge": 118, "shortest_edge": 100},
do_pad=False,
)
inputs = image_processor(images=[image_3], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118]))
image_4 = torch.ones([128, 50, 3], dtype=torch.uint8)
# max size is set; height == size; width < max_size;
# do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128
image_processor = ConditionalDetrImageProcessor(
size={"longest_edge": 256, "shortest_edge": 50},
do_pad=False,
)
inputs = image_processor(images=[image_4], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50]))
image_5 = torch.ones([50, 50, 3], dtype=torch.uint8)
# max size is set; height == width; width < max_size;
# do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50
image_processor = ConditionalDetrImageProcessor(
size={"longest_edge": 117, "shortest_edge": 50},
do_pad=False,
)
inputs = image_processor(images=[image_5], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))

View File

@@ -539,3 +539,55 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
) )
inputs = image_processor(images=[image_1, image_2], return_tensors="pt") inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
def test_longest_edge_shortest_edge_resizing_strategy(self):
image_1 = torch.ones([958, 653, 3], dtype=torch.uint8)
# max size is set; width < height;
# do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436
image_processor = DeformableDetrImageProcessor(
size={"longest_edge": 640, "shortest_edge": 640},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436]))
image_2 = torch.ones([653, 958, 3], dtype=torch.uint8)
# max size is set; height < width;
# do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640
image_processor = DeformableDetrImageProcessor(
size={"longest_edge": 640, "shortest_edge": 640},
do_pad=False,
)
inputs = image_processor(images=[image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640]))
image_3 = torch.ones([100, 120, 3], dtype=torch.uint8)
# max size is set; width == size; height > max_size;
# do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98
image_processor = DeformableDetrImageProcessor(
size={"longest_edge": 118, "shortest_edge": 100},
do_pad=False,
)
inputs = image_processor(images=[image_3], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118]))
image_4 = torch.ones([128, 50, 3], dtype=torch.uint8)
# max size is set; height == size; width < max_size;
# do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128
image_processor = DeformableDetrImageProcessor(
size={"longest_edge": 256, "shortest_edge": 50},
do_pad=False,
)
inputs = image_processor(images=[image_4], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50]))
image_5 = torch.ones([50, 50, 3], dtype=torch.uint8)
# max size is set; height == width; width < max_size;
# do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50
image_processor = DeformableDetrImageProcessor(
size={"longest_edge": 117, "shortest_edge": 50},
do_pad=False,
)
inputs = image_processor(images=[image_5], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))

View File

@@ -593,3 +593,55 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
) )
inputs = image_processor(images=[image_1, image_2], return_tensors="pt") inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
def test_longest_edge_shortest_edge_resizing_strategy(self):
image_1 = torch.ones([958, 653, 3], dtype=torch.uint8)
# max size is set; width < height;
# do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436
image_processor = DetrImageProcessor(
size={"longest_edge": 640, "shortest_edge": 640},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436]))
image_2 = torch.ones([653, 958, 3], dtype=torch.uint8)
# max size is set; height < width;
# do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640
image_processor = DetrImageProcessor(
size={"longest_edge": 640, "shortest_edge": 640},
do_pad=False,
)
inputs = image_processor(images=[image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640]))
image_3 = torch.ones([100, 120, 3], dtype=torch.uint8)
# max size is set; width == size; height > max_size;
# do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98
image_processor = DetrImageProcessor(
size={"longest_edge": 118, "shortest_edge": 100},
do_pad=False,
)
inputs = image_processor(images=[image_3], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118]))
image_4 = torch.ones([128, 50, 3], dtype=torch.uint8)
# max size is set; height == size; width < max_size;
# do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128
image_processor = DetrImageProcessor(
size={"longest_edge": 256, "shortest_edge": 50},
do_pad=False,
)
inputs = image_processor(images=[image_4], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50]))
image_5 = torch.ones([50, 50, 3], dtype=torch.uint8)
# max size is set; height == width; width < max_size;
# do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50
image_processor = DetrImageProcessor(
size={"longest_edge": 117, "shortest_edge": 50},
do_pad=False,
)
inputs = image_processor(images=[image_5], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))

View File

@@ -575,3 +575,55 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
) )
inputs = image_processor(images=[image_1, image_2], return_tensors="pt") inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
def test_longest_edge_shortest_edge_resizing_strategy(self):
image_1 = torch.ones([958, 653, 3], dtype=torch.uint8)
# max size is set; width < height;
# do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436
image_processor = GroundingDinoImageProcessor(
size={"longest_edge": 640, "shortest_edge": 640},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436]))
image_2 = torch.ones([653, 958, 3], dtype=torch.uint8)
# max size is set; height < width;
# do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640
image_processor = GroundingDinoImageProcessor(
size={"longest_edge": 640, "shortest_edge": 640},
do_pad=False,
)
inputs = image_processor(images=[image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640]))
image_3 = torch.ones([100, 120, 3], dtype=torch.uint8)
# max size is set; width == size; height > max_size;
# do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98
image_processor = GroundingDinoImageProcessor(
size={"longest_edge": 118, "shortest_edge": 100},
do_pad=False,
)
inputs = image_processor(images=[image_3], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118]))
image_4 = torch.ones([128, 50, 3], dtype=torch.uint8)
# max size is set; height == size; width < max_size;
# do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128
image_processor = GroundingDinoImageProcessor(
size={"longest_edge": 256, "shortest_edge": 50},
do_pad=False,
)
inputs = image_processor(images=[image_4], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50]))
image_5 = torch.ones([50, 50, 3], dtype=torch.uint8)
# max size is set; height == width; width < max_size;
# do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50
image_processor = GroundingDinoImageProcessor(
size={"longest_edge": 117, "shortest_edge": 50},
do_pad=False,
)
inputs = image_processor(images=[image_5], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))