Add fixed resize and pad strategy for object detection (#30742)

* Add resize and pad strategy * Merge get_size functions * Add pad_size + tests to object detection models * Fixup * Update docstrings * Fixup
2024-05-17 16:21:26 +01:00
parent e9a8041d1c
commit bf646fbf2d
13 changed files with 929 additions and 89 deletions
--- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py
+++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py
@@ -490,3 +490,50 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
+
+    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr
+    def test_max_width_max_height_resizing_and_pad_strategy(self):
+        """Check output shapes for max_height/max_width resizing, with and without `pad_size` padding."""
+        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
+
+        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
+        image_processor = ConditionalDetrImageProcessor(
+            size={"max_height": 100, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
+
+        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
+        image_processor = ConditionalDetrImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        # This scenario previously had no assertion, so the expectation above was never verified.
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
+
+        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
+        image_processor = ConditionalDetrImageProcessor(
+            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
+
+        # do_pad=True, max_height=300, max_width=100, pad_size=301x101, image=200x100 -> 301x101
+        image_processor = ConditionalDetrImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 301, "width": 101},
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
+
+        ### Check for batch
+        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
+
+        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
+        image_processor = ConditionalDetrImageProcessor(
+            size={"max_height": 150, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 150, "width": 100},
+        )
+        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
--- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py
+++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
@@ -492,3 +492,50 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
+
+    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr
+    def test_max_width_max_height_resizing_and_pad_strategy(self):
+        """Check output shapes for max_height/max_width resizing, with and without `pad_size` padding."""
+        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
+
+        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
+        image_processor = DeformableDetrImageProcessor(
+            size={"max_height": 100, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
+
+        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
+        image_processor = DeformableDetrImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        # This scenario previously had no assertion, so the expectation above was never verified.
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
+
+        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
+        image_processor = DeformableDetrImageProcessor(
+            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
+
+        # do_pad=True, max_height=300, max_width=100, pad_size=301x101, image=200x100 -> 301x101
+        image_processor = DeformableDetrImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 301, "width": 101},
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
+
+        ### Check for batch
+        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
+
+        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
+        image_processor = DeformableDetrImageProcessor(
+            size={"max_height": 150, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 150, "width": 100},
+        )
+        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
--- a/tests/models/deta/test_image_processing_deta.py
+++ b/tests/models/deta/test_image_processing_deta.py
@@ -486,3 +486,50 @@ class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
+
+    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Deta
+    def test_max_width_max_height_resizing_and_pad_strategy(self):
+        """Check output shapes for max_height/max_width resizing, with and without `pad_size` padding."""
+        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
+
+        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
+        image_processor = DetaImageProcessor(
+            size={"max_height": 100, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
+
+        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
+        image_processor = DetaImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        # This scenario previously had no assertion, so the expectation above was never verified.
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
+
+        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
+        image_processor = DetaImageProcessor(
+            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
+
+        # do_pad=True, max_height=300, max_width=100, pad_size=301x101, image=200x100 -> 301x101
+        image_processor = DetaImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 301, "width": 101},
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
+
+        ### Check for batch
+        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
+
+        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
+        image_processor = DetaImageProcessor(
+            size={"max_height": 150, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 150, "width": 100},
+        )
+        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
--- a/tests/models/detr/test_image_processing_detr.py
+++ b/tests/models/detr/test_image_processing_detr.py
@@ -547,3 +547,49 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
+
+    def test_max_width_max_height_resizing_and_pad_strategy(self):
+        """Check output shapes for max_height/max_width resizing, with and without `pad_size` padding."""
+        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
+
+        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
+        image_processor = DetrImageProcessor(
+            size={"max_height": 100, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
+
+        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
+        image_processor = DetrImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        # This scenario previously had no assertion, so the expectation above was never verified.
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
+
+        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
+        image_processor = DetrImageProcessor(
+            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
+
+        # do_pad=True, max_height=300, max_width=100, pad_size=301x101, image=200x100 -> 301x101
+        image_processor = DetrImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 301, "width": 101},
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
+
+        ### Check for batch
+        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
+
+        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
+        image_processor = DetrImageProcessor(
+            size={"max_height": 150, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 150, "width": 100},
+        )
+        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
--- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py
+++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py
@@ -528,3 +528,50 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
+
+    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->GroundingDino
+    def test_max_width_max_height_resizing_and_pad_strategy(self):
+        """Check output shapes for max_height/max_width resizing, with and without `pad_size` padding."""
+        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
+
+        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
+        image_processor = GroundingDinoImageProcessor(
+            size={"max_height": 100, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
+
+        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
+        image_processor = GroundingDinoImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        # This scenario previously had no assertion, so the expectation above was never verified.
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
+
+        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
+        image_processor = GroundingDinoImageProcessor(
+            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
+
+        # do_pad=True, max_height=300, max_width=100, pad_size=301x101, image=200x100 -> 301x101
+        image_processor = GroundingDinoImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 301, "width": 101},
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
+
+        ### Check for batch
+        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
+
+        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
+        image_processor = GroundingDinoImageProcessor(
+            size={"max_height": 150, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 150, "width": 100},
+        )
+        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
--- a/tests/models/yolos/test_image_processing_yolos.py
+++ b/tests/models/yolos/test_image_processing_yolos.py
@@ -546,3 +546,50 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
+
+    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos
+    def test_max_width_max_height_resizing_and_pad_strategy(self):
+        """Check output shapes for max_height/max_width resizing, with and without `pad_size` padding."""
+        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
+
+        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
+        image_processor = YolosImageProcessor(
+            size={"max_height": 100, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
+
+        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
+        image_processor = YolosImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        # This scenario previously had no assertion, so the expectation above was never verified.
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
+
+        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
+        image_processor = YolosImageProcessor(
+            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
+
+        # do_pad=True, max_height=300, max_width=100, pad_size=301x101, image=200x100 -> 301x101
+        image_processor = YolosImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 301, "width": 101},
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
+
+        ### Check for batch
+        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
+
+        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
+        image_processor = YolosImageProcessor(
+            size={"max_height": 150, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 150, "width": 100},
+        )
+        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))