Add post_process_depth_estimation to image processors and support ZoeDepth's inference intricacies (#32550)

* add colorize_depth and matplotlib availability check * add post_process_depth_estimation for zoedepth + tests * add post_process_depth_estimation for DPT + tests * add post_process_depth_estimation in DepthEstimationPipeline & special case for zoedepth * run `make fixup` * fix import related error on tests * fix more import related errors on test * forgot some `torch` calls in declarations * remove `torch` call in zoedepth tests that caused error * updated docs for depth estimation * small fix for `colorize` input/output types * remove `colorize_depth`, fix various names, remove matplotlib dependency * fix formatting * run fixup * different images for test * update examples in `forward` functions * fixed broken links * fix output types for docs * possible format fix inside `<Tip>` * Readability related updates Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * Readability related update * cleanup after merge * refactor `post_process_depth_estimation` to return dict; simplify ZoeDepth's `post_process_depth_estimation` * rewrite dict merging to support python 3.8 --------- Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
2024-10-22 16:50:54 +03:00
parent 104599d7a8
commit c31a6ff474
13 changed files with 437 additions and 203 deletions
--- a/tests/models/dpt/test_modeling_dpt.py
+++ b/tests/models/dpt/test_modeling_dpt.py
@@ -384,3 +384,29 @@ class DPTModelIntegrationTest(unittest.TestCase):
        segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
        expected_shape = torch.Size((480, 480))
        self.assertEqual(segmentation[0].shape, expected_shape)
+
+    def test_post_processing_depth_estimation(self):  # checks post_process_depth_estimation output shapes with and without target_sizes
+        image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
+        model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")
+
+        image = prepare_img()
+        inputs = image_processor(images=image, return_tensors="pt")
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"]  # no target_sizes: first result of the returned list
+        expected_shape = torch.Size((384, 384))
+        self.assertTrue(predicted_depth.shape == expected_shape)
+
+        predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[(500, 500)])  # same model outputs, now requesting a (500, 500) result
+        predicted_depth_l = predicted_depth_l[0]["predicted_depth"]
+        expected_shape = torch.Size((500, 500))
+        self.assertTrue(predicted_depth_l.shape == expected_shape)
+
+        output_enlarged = torch.nn.functional.interpolate(  # reference value: bicubic resize of the default-size result to (500, 500)
+            predicted_depth.unsqueeze(0).unsqueeze(1), size=(500, 500), mode="bicubic", align_corners=False
+        ).squeeze()  # drop the two singleton dims added by the unsqueeze calls
+        self.assertTrue(output_enlarged.shape == expected_shape)
+        self.assertTrue(torch.allclose(predicted_depth_l, output_enlarged, rtol=1e-3))  # target_sizes path must agree with the manual bicubic resize
--- a/tests/models/zoedepth/test_modeling_zoedepth.py
+++ b/tests/models/zoedepth/test_modeling_zoedepth.py
@@ -16,6 +16,8 @@

 import unittest

+import numpy as np
+
 from transformers import Dinov2Config, ZoeDepthConfig
 from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
@@ -212,6 +214,25 @@ def prepare_img():
@require_vision
@slow
 class ZoeDepthModelIntegrationTest(unittest.TestCase):
+    expected_slice_post_processing = {  # keyed by (pad_input, flip_aug); each value lists one expected 3x3 top-left depth slice per test image
+        (False, False): [
+            [[1.1348238, 1.1193453, 1.130562], [1.1754476, 1.1613507, 1.1701596], [1.2287744, 1.2101802, 1.2148322]],
+            [[2.7170, 2.6550, 2.6839], [2.9827, 2.9438, 2.9587], [3.2340, 3.1817, 3.1602]],
+        ],
+        (False, True): [
+            [[1.0610938, 1.1042216, 1.1429265], [1.1099341, 1.148696, 1.1817775], [1.1656011, 1.1988826, 1.2268101]],
+            [[2.5848, 2.7391, 2.8694], [2.7882, 2.9872, 3.1244], [2.9436, 3.1812, 3.3188]],
+        ],
+        (True, False): [
+            [[1.8382794, 1.8380532, 1.8375976], [1.848761, 1.8485023, 1.8479986], [1.8571457, 1.8568444, 1.8562847]],
+            [[6.2030, 6.1902, 6.1777], [6.2303, 6.2176, 6.2053], [6.2561, 6.2436, 6.2312]],
+        ],
+        (True, True): [
+            [[1.8306141, 1.8305621, 1.8303483], [1.8410318, 1.8409299, 1.8406585], [1.8492792, 1.8491366, 1.8488203]],
+            [[6.2616, 6.2520, 6.2435], [6.2845, 6.2751, 6.2667], [6.3065, 6.2972, 6.2887]],
+        ],
+    }  # (pad, flip)
+
    def test_inference_depth_estimation(self):
        image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu")
        model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu").to(torch_device)
@@ -255,3 +276,81 @@ class ZoeDepthModelIntegrationTest(unittest.TestCase):
        ).to(torch_device)

        self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4))
+
+    def check_target_size(  # helper: post-processes again with doubled target_sizes and checks consistency with `outputs`
+        self,
+        image_processor,
+        pad_input,
+        images,
+        outputs,
+        raw_outputs,
+        raw_outputs_flipped=None,
+    ):
+        outputs_large = image_processor.post_process_depth_estimation(
+            raw_outputs,
+            [img.size[::-1] for img in images],  # source sizes: each image's `size` tuple reversed
+            outputs_flipped=raw_outputs_flipped,
+            target_sizes=[tuple(np.array(img.size[::-1]) * 2) for img in images],  # request twice the source size per image
+            do_remove_padding=pad_input,
+        )
+
+        for img, out, out_l in zip(images, outputs, outputs_large):
+            out = out["predicted_depth"]
+            out_l = out_l["predicted_depth"]
+            out_l_reduced = torch.nn.functional.interpolate(  # bicubic-resize the doubled output back to the source size
+                out_l.unsqueeze(0).unsqueeze(1), size=img.size[::-1], mode="bicubic", align_corners=False
+            )  # NOTE(review): not squeezed, so the allclose below presumably relies on broadcasting against `out` - confirm
+            self.assertTrue((np.array(out_l.shape)[::-1] == np.array(img.size) * 2).all())  # doubled output has exactly 2x the image size
+            self.assertTrue(torch.allclose(out, out_l_reduced, rtol=2e-2))  # resized-back output must approximately match the default-size output
+
+    def check_post_processing_test(self, image_processor, images, model, pad_input=True, flip_aug=True):  # helper: runs the model, post-processes, and compares against the stored expected slices
+        inputs = image_processor(images=images, return_tensors="pt", do_pad=pad_input).to(torch_device)
+
+        with torch.no_grad():
+            raw_outputs = model(**inputs)
+            raw_outputs_flipped = None
+            if flip_aug:
+                raw_outputs_flipped = model(pixel_values=torch.flip(inputs.pixel_values, dims=[3]))  # second forward pass on input flipped along dim 3 (presumably width - confirm)
+
+        outputs = image_processor.post_process_depth_estimation(
+            raw_outputs,
+            [img.size[::-1] for img in images],  # source sizes: each image's `size` tuple reversed
+            outputs_flipped=raw_outputs_flipped,
+            do_remove_padding=pad_input,
+        )
+
+        expected_slices = torch.tensor(self.expected_slice_post_processing[pad_input, flip_aug]).to(torch_device)  # pick the reference values for this (pad, flip) combination
+        for img, out, expected_slice in zip(images, outputs, expected_slices):
+            out = out["predicted_depth"]
+            self.assertTrue(img.size == out.shape[::-1])  # output shape, reversed, must equal the image's `size`
+            self.assertTrue(torch.allclose(expected_slice, out[:3, :3], rtol=1e-3))  # compare only the top-left 3x3 patch
+
+        self.check_target_size(image_processor, pad_input, images, outputs, raw_outputs, raw_outputs_flipped)  # additionally verify the target_sizes code path
+
+    def test_post_processing_depth_estimation_post_processing_nopad_noflip(self):  # case: pad_input=False, flip_aug=False. NOTE(review): name prefix differs from the sibling `test_inference_depth_estimation_post_processing_*` tests
+        images = [prepare_img(), Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")]  # two images, matching the two slices stored per key in expected_slice_post_processing
+        image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", keep_aspect_ratio=False)
+        model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device)
+
+        self.check_post_processing_test(image_processor, images, model, pad_input=False, flip_aug=False)
+
+    def test_inference_depth_estimation_post_processing_nopad_flip(self):  # case: pad_input=False, flip_aug=True
+        images = [prepare_img(), Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")]  # two images, matching the two slices stored per key in expected_slice_post_processing
+        image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", keep_aspect_ratio=False)
+        model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device)
+
+        self.check_post_processing_test(image_processor, images, model, pad_input=False, flip_aug=True)
+
+    def test_inference_depth_estimation_post_processing_pad_noflip(self):  # case: pad_input=True, flip_aug=False
+        images = [prepare_img(), Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")]  # two images, matching the two slices stored per key in expected_slice_post_processing
+        image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", keep_aspect_ratio=False)
+        model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device)
+
+        self.check_post_processing_test(image_processor, images, model, pad_input=True, flip_aug=False)
+
+    def test_inference_depth_estimation_post_processing_pad_flip(self):  # case: pad_input=True, flip_aug=True
+        images = [prepare_img(), Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")]  # two images, matching the two slices stored per key in expected_slice_post_processing
+        image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", keep_aspect_ratio=False)
+        model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device)
+
+        self.check_post_processing_test(image_processor, images, model, pad_input=True, flip_aug=True)