Add post_process_depth_estimation to image processors and support ZoeDepth's inference intricacies (#32550)

* add colorize_depth and matplotlib availability check

* add post_process_depth_estimation for zoedepth + tests

* add post_process_depth_estimation for DPT + tests

* add post_process_depth_estimation in DepthEstimationPipeline & special case for zoedepth

* run `make fixup`

* fix import related error on tests

* fix more import related errors on test

* forgot some `torch` calls in declarations

* remove `torch` call in zoedepth tests that caused error

* updated docs for depth estimation

* small fix for `colorize` input/output types

* remove `colorize_depth`, fix various names, remove matplotlib dependency

* fix formatting

* run fixup

* different images for test

* update examples in `forward` functions

* fixed broken links

* fix output types for docs

* possible format fix inside `<Tip>`

* Readability related updates

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* Readability related update

* cleanup after merge

* refactor `post_process_depth_estimation` to return dict; simplify ZoeDepth's `post_process_depth_estimation`

* rewrite dict merging to support python 3.8

---------

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
This commit is contained in:
Alexandros Benetatos
2024-10-22 16:50:54 +03:00
committed by GitHub
parent 104599d7a8
commit c31a6ff474
13 changed files with 437 additions and 203 deletions

View File

@@ -384,3 +384,29 @@ class DPTModelIntegrationTest(unittest.TestCase):
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
expected_shape = torch.Size((480, 480))
self.assertEqual(segmentation[0].shape, expected_shape)
def test_post_processing_depth_estimation(self):
    """Check `post_process_depth_estimation` of the DPT image processor.

    Runs `Intel/dpt-large` on the sample image and verifies that:
      * without `target_sizes` the post-processed depth map is 384x384;
      * with `target_sizes=[(500, 500)]` the depth map is 500x500 and agrees
        (rtol=1e-3) with a bicubic `interpolate` of the un-resized depth map.
    """
    image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
    model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

    image = prepare_img()
    inputs = image_processor(images=image, return_tensors="pt")

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # No `target_sizes`: the depth map is returned at the size produced by the model.
    predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"]
    # `assertEqual` reports both shapes on failure, unlike `assertTrue(a == b)`.
    self.assertEqual(predicted_depth.shape, torch.Size((384, 384)))

    # With `target_sizes`: the depth map must come back at the requested size.
    target_size = (500, 500)
    predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[target_size])
    predicted_depth_l = predicted_depth_l[0]["predicted_depth"]
    expected_shape = torch.Size(target_size)
    self.assertEqual(predicted_depth_l.shape, expected_shape)

    # Reference: resize the un-resized prediction ourselves and compare.
    output_enlarged = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(0).unsqueeze(1), size=target_size, mode="bicubic", align_corners=False
    ).squeeze()
    self.assertEqual(output_enlarged.shape, expected_shape)
    self.assertTrue(torch.allclose(predicted_depth_l, output_enlarged, rtol=1e-3))

View File

@@ -16,6 +16,8 @@
import unittest
import numpy as np
from transformers import Dinov2Config, ZoeDepthConfig
from transformers.file_utils import is_torch_available, is_vision_available
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
@@ -212,6 +214,25 @@ def prepare_img():
@require_vision
@slow
class ZoeDepthModelIntegrationTest(unittest.TestCase):
# Reference values for the post-processing tests, keyed by `(pad_input, flip_aug)`.
# Each value holds one 3x3 block per test image (in the order the images are passed);
# it is compared against the top-left `[:3, :3]` corner of that image's post-processed
# `predicted_depth`.
expected_slice_post_processing = {
(False, False): [
[[1.1348238, 1.1193453, 1.130562], [1.1754476, 1.1613507, 1.1701596], [1.2287744, 1.2101802, 1.2148322]],
[[2.7170, 2.6550, 2.6839], [2.9827, 2.9438, 2.9587], [3.2340, 3.1817, 3.1602]],
],
(False, True): [
[[1.0610938, 1.1042216, 1.1429265], [1.1099341, 1.148696, 1.1817775], [1.1656011, 1.1988826, 1.2268101]],
[[2.5848, 2.7391, 2.8694], [2.7882, 2.9872, 3.1244], [2.9436, 3.1812, 3.3188]],
],
(True, False): [
[[1.8382794, 1.8380532, 1.8375976], [1.848761, 1.8485023, 1.8479986], [1.8571457, 1.8568444, 1.8562847]],
[[6.2030, 6.1902, 6.1777], [6.2303, 6.2176, 6.2053], [6.2561, 6.2436, 6.2312]],
],
(True, True): [
[[1.8306141, 1.8305621, 1.8303483], [1.8410318, 1.8409299, 1.8406585], [1.8492792, 1.8491366, 1.8488203]],
[[6.2616, 6.2520, 6.2435], [6.2845, 6.2751, 6.2667], [6.3065, 6.2972, 6.2887]],
],
} # (pad, flip)
def test_inference_depth_estimation(self):
image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu")
model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu").to(torch_device)
@@ -255,3 +276,81 @@ class ZoeDepthModelIntegrationTest(unittest.TestCase):
).to(torch_device)
self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4))
def check_target_size(
    self,
    image_processor,
    pad_input,
    images,
    outputs,
    raw_outputs,
    raw_outputs_flipped=None,
):
    """Check post-processing when `target_sizes` is twice each image's size.

    Re-runs `post_process_depth_estimation` on `raw_outputs` with doubled
    `target_sizes`, then for every image asserts that the enlarged depth map
    has twice the image's width/height and that shrinking it back with bicubic
    interpolation is close (rtol=2e-2) to the matching entry of `outputs`.

    Args:
        image_processor: object exposing `post_process_depth_estimation`.
        pad_input: forwarded as `do_remove_padding`.
        images: images exposing a `size` attribute, reversed here before use.
        outputs: post-processed results at the original size, one dict per image.
        raw_outputs: model outputs passed to the post-processor.
        raw_outputs_flipped: optional second set of model outputs, forwarded
            as `outputs_flipped`.
    """
    source_sizes = [img.size[::-1] for img in images]
    doubled_sizes = [tuple(np.array(size) * 2) for size in source_sizes]

    outputs_large = image_processor.post_process_depth_estimation(
        raw_outputs,
        source_sizes,
        outputs_flipped=raw_outputs_flipped,
        target_sizes=doubled_sizes,
        do_remove_padding=pad_input,
    )

    for img, small, large in zip(images, outputs, outputs_large):
        depth = small["predicted_depth"]
        depth_large = large["predicted_depth"]

        # Bring the enlarged map back down to the original resolution for comparison.
        depth_shrunk = torch.nn.functional.interpolate(
            depth_large.unsqueeze(0).unsqueeze(1),
            size=img.size[::-1],
            mode="bicubic",
            align_corners=False,
        )

        doubled = np.array(img.size) * 2
        self.assertTrue((np.array(depth_large.shape)[::-1] == doubled).all())
        self.assertTrue(torch.allclose(depth, depth_shrunk, rtol=2e-2))
def check_post_processing_test(self, image_processor, images, model, pad_input=True, flip_aug=True):
    """Run the model on `images` and validate `post_process_depth_estimation`.

    Args:
        image_processor: processor used for both pre- and post-processing.
        images: images exposing a `size` attribute; one expected slice per image
            is read from `self.expected_slice_post_processing`.
        model: depth-estimation model called on the processed inputs.
        pad_input: passed as `do_pad` to the processor and as
            `do_remove_padding` to the post-processor.
        flip_aug: when true, a second forward pass on the inputs flipped along
            dim 3 is run and handed to the post-processor as `outputs_flipped`.
    """
    inputs = image_processor(images=images, return_tensors="pt", do_pad=pad_input).to(torch_device)

    # Both forward passes are inference-only; keep them under `no_grad` so that
    # no autograd graph is built for either of them.
    with torch.no_grad():
        raw_outputs = model(**inputs)
        raw_outputs_flipped = None
        if flip_aug:
            raw_outputs_flipped = model(pixel_values=torch.flip(inputs.pixel_values, dims=[3]))

    outputs = image_processor.post_process_depth_estimation(
        raw_outputs,
        [img.size[::-1] for img in images],
        outputs_flipped=raw_outputs_flipped,
        do_remove_padding=pad_input,
    )

    expected_slices = torch.tensor(self.expected_slice_post_processing[pad_input, flip_aug]).to(torch_device)
    for img, out, expected_slice in zip(images, outputs, expected_slices):
        out = out["predicted_depth"]
        # The reversed depth-map shape must equal `img.size`; `assertEqual`
        # prints both values on mismatch.
        self.assertEqual(img.size, tuple(out.shape[::-1]))
        self.assertTrue(torch.allclose(expected_slice, out[:3, :3], rtol=1e-3))

    self.check_target_size(image_processor, pad_input, images, outputs, raw_outputs, raw_outputs_flipped)
def test_post_processing_depth_estimation_post_processing_nopad_noflip(self):
    """Post-processing check with `pad_input=False` and `flip_aug=False`."""
    # NOTE(review): the sibling tests are named `test_inference_depth_estimation_post_processing_*`;
    # this name is kept as-is so existing test selections keep working.
    checkpoint = "Intel/zoedepth-nyu-kitti"

    images = [prepare_img()]
    images.append(Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png"))

    image_processor = ZoeDepthImageProcessor.from_pretrained(checkpoint, keep_aspect_ratio=False)
    model = ZoeDepthForDepthEstimation.from_pretrained(checkpoint)
    model = model.to(torch_device)

    self.check_post_processing_test(image_processor, images, model, pad_input=False, flip_aug=False)
def test_inference_depth_estimation_post_processing_nopad_flip(self):
    """Post-processing check with `pad_input=False` and `flip_aug=True`."""
    checkpoint = "Intel/zoedepth-nyu-kitti"

    images = [prepare_img()]
    images.append(Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png"))

    image_processor = ZoeDepthImageProcessor.from_pretrained(checkpoint, keep_aspect_ratio=False)
    model = ZoeDepthForDepthEstimation.from_pretrained(checkpoint)
    model = model.to(torch_device)

    self.check_post_processing_test(image_processor, images, model, pad_input=False, flip_aug=True)
def test_inference_depth_estimation_post_processing_pad_noflip(self):
    """Post-processing check with `pad_input=True` and `flip_aug=False`."""
    checkpoint = "Intel/zoedepth-nyu-kitti"

    images = [prepare_img()]
    images.append(Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png"))

    image_processor = ZoeDepthImageProcessor.from_pretrained(checkpoint, keep_aspect_ratio=False)
    model = ZoeDepthForDepthEstimation.from_pretrained(checkpoint)
    model = model.to(torch_device)

    self.check_post_processing_test(image_processor, images, model, pad_input=True, flip_aug=False)
def test_inference_depth_estimation_post_processing_pad_flip(self):
    """Post-processing check with `pad_input=True` and `flip_aug=True`."""
    checkpoint = "Intel/zoedepth-nyu-kitti"

    images = [prepare_img()]
    images.append(Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png"))

    image_processor = ZoeDepthImageProcessor.from_pretrained(checkpoint, keep_aspect_ratio=False)
    model = ZoeDepthForDepthEstimation.from_pretrained(checkpoint)
    model = model.to(torch_device)

    self.check_post_processing_test(image_processor, images, model, pad_input=True, flip_aug=True)