From 1d7773594754457ed4a79cf6d98bcaabea5bff51 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 20 Dec 2023 20:55:51 +0000 Subject: [PATCH] Fix yolos resizing (#27663) * Fix yolos resizing * Update tests * Add a test --- .../models/detr/image_processing_detr.py | 1 + .../models/yolos/image_processing_yolos.py | 22 +++++----- .../yolos/test_image_processing_yolos.py | 44 ++++++++++++++----- 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 24c36c5d10..98fce25624 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -82,6 +82,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +# From the original repo: https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/datasets/transforms.py#L76 def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: """ Computes the output image size given the input image size and the desired output size. diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 3b0c635c0e..6b9aba42e5 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -99,7 +99,6 @@ def get_max_height_width( return (max_height, max_width) -# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]: """ Computes the output image size given the input image size and the desired output size. @@ -119,16 +118,17 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in if max_original_size / min_original_size * size > max_size: size = int(round(max_size * min_original_size / max_original_size)) - if (height <= width and height == size) or (width <= height and width == size): - return height, width - - if width < height: - ow = size - oh = int(size * height / width) - else: - oh = size - ow = int(size * width / height) - return (oh, ow) + if width < height and width != size: + height = int(size * height / width) + width = size + elif height < width and height != size: + width = int(size * width / height) + height = size + width_mod = np.mod(width, 16) + height_mod = np.mod(height, 16) + width = width - width_mod + height = height - height_mod + return (height, width) # Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py index 1039e4c91b..558c3f7391 100644 --- a/tests/models/yolos/test_image_processing_yolos.py +++ b/tests/models/yolos/test_image_processing_yolos.py @@ -86,18 +86,28 @@ class YolosImageProcessingTester(unittest.TestCase): if not batched: image = image_inputs[0] if isinstance(image, Image.Image): - w, h = image.size + width, height = image.size else: - h, w = image.shape[1], image.shape[2] - if w < h: - expected_height = int(self.size["shortest_edge"] * h / w) - expected_width = self.size["shortest_edge"] - elif w > h: - expected_height = self.size["shortest_edge"] - expected_width = int(self.size["shortest_edge"] * w / h) - else: - expected_height = self.size["shortest_edge"] - expected_width = self.size["shortest_edge"] + height, width = image.shape[1], image.shape[2] + + size = self.size["shortest_edge"] + max_size = self.size.get("longest_edge", None) + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if width < height and width != size: + height = int(size * height / width) + width = size + elif height < width and height != size: + width = int(size * width / height) + height = size + width_mod = width % 16 + height_mod = height % 16 + expected_width = width - width_mod + expected_height = height - height_mod else: expected_values = [] @@ -173,6 +183,18 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) ) + def test_resize_max_size_respected(self): + image_processor = self.image_processing_class(**self.image_processor_dict) + + # create torch tensors as image + image = torch.randint(0, 256, (3, 100, 1500), dtype=torch.uint8) + processed_image = image_processor( + image, size={"longest_edge": 1333, "shortest_edge": 800}, do_pad=False, return_tensors="pt" + )["pixel_values"] + + self.assertTrue(processed_image.shape[-1] <= 1333) + self.assertTrue(processed_image.shape[-2] <= 800) + @slow def test_call_pytorch_with_coco_detection_annotations(self): # prepare image and target