Add post_process_depth_estimation to image processors and support ZoeDepth's inference intricacies (#32550)
* add colorize_depth and matplotlib availability check * add post_process_depth_estimation for zoedepth + tests * add post_process_depth_estimation for DPT + tests * add post_process_depth_estimation in DepthEstimationPipeline & special case for zoedepth * run `make fixup` * fix import related error on tests * fix more import related errors on test * forgot some `torch` calls in declerations * remove `torch` call in zoedepth tests that caused error * updated docs for depth estimation * small fix for `colorize` input/output types * remove `colorize_depth`, fix various names, remove matplotlib dependency * fix formatting * run fixup * different images for test * update examples in `forward` functions * fixed broken links * fix output types for docs * possible format fix inside `<Tip>` * Readability related updates Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * Readability related update * cleanup after merge * refactor `post_process_depth_estimation` to return dict; simplify ZoeDepth's `post_process_depth_estimation` * rewrite dict merging to support python 3.8 --------- Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
This commit is contained in:
committed by
GitHub
parent
104599d7a8
commit
c31a6ff474
@@ -384,3 +384,29 @@ class DPTModelIntegrationTest(unittest.TestCase):
|
||||
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
|
||||
expected_shape = torch.Size((480, 480))
|
||||
self.assertEqual(segmentation[0].shape, expected_shape)
|
||||
|
||||
def test_post_processing_depth_estimation(self):
|
||||
image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
|
||||
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")
|
||||
|
||||
image = prepare_img()
|
||||
inputs = image_processor(images=image, return_tensors="pt")
|
||||
|
||||
# forward pass
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs)
|
||||
|
||||
predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"]
|
||||
expected_shape = torch.Size((384, 384))
|
||||
self.assertTrue(predicted_depth.shape == expected_shape)
|
||||
|
||||
predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[(500, 500)])
|
||||
predicted_depth_l = predicted_depth_l[0]["predicted_depth"]
|
||||
expected_shape = torch.Size((500, 500))
|
||||
self.assertTrue(predicted_depth_l.shape == expected_shape)
|
||||
|
||||
output_enlarged = torch.nn.functional.interpolate(
|
||||
predicted_depth.unsqueeze(0).unsqueeze(1), size=(500, 500), mode="bicubic", align_corners=False
|
||||
).squeeze()
|
||||
self.assertTrue(output_enlarged.shape == expected_shape)
|
||||
self.assertTrue(torch.allclose(predicted_depth_l, output_enlarged, rtol=1e-3))
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers import Dinov2Config, ZoeDepthConfig
|
||||
from transformers.file_utils import is_torch_available, is_vision_available
|
||||
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
|
||||
@@ -212,6 +214,25 @@ def prepare_img():
|
||||
@require_vision
|
||||
@slow
|
||||
class ZoeDepthModelIntegrationTest(unittest.TestCase):
|
||||
expected_slice_post_processing = {
|
||||
(False, False): [
|
||||
[[1.1348238, 1.1193453, 1.130562], [1.1754476, 1.1613507, 1.1701596], [1.2287744, 1.2101802, 1.2148322]],
|
||||
[[2.7170, 2.6550, 2.6839], [2.9827, 2.9438, 2.9587], [3.2340, 3.1817, 3.1602]],
|
||||
],
|
||||
(False, True): [
|
||||
[[1.0610938, 1.1042216, 1.1429265], [1.1099341, 1.148696, 1.1817775], [1.1656011, 1.1988826, 1.2268101]],
|
||||
[[2.5848, 2.7391, 2.8694], [2.7882, 2.9872, 3.1244], [2.9436, 3.1812, 3.3188]],
|
||||
],
|
||||
(True, False): [
|
||||
[[1.8382794, 1.8380532, 1.8375976], [1.848761, 1.8485023, 1.8479986], [1.8571457, 1.8568444, 1.8562847]],
|
||||
[[6.2030, 6.1902, 6.1777], [6.2303, 6.2176, 6.2053], [6.2561, 6.2436, 6.2312]],
|
||||
],
|
||||
(True, True): [
|
||||
[[1.8306141, 1.8305621, 1.8303483], [1.8410318, 1.8409299, 1.8406585], [1.8492792, 1.8491366, 1.8488203]],
|
||||
[[6.2616, 6.2520, 6.2435], [6.2845, 6.2751, 6.2667], [6.3065, 6.2972, 6.2887]],
|
||||
],
|
||||
} # (pad, flip)
|
||||
|
||||
def test_inference_depth_estimation(self):
|
||||
image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu")
|
||||
model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu").to(torch_device)
|
||||
@@ -255,3 +276,81 @@ class ZoeDepthModelIntegrationTest(unittest.TestCase):
|
||||
).to(torch_device)
|
||||
|
||||
self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4))
|
||||
|
||||
def check_target_size(
|
||||
self,
|
||||
image_processor,
|
||||
pad_input,
|
||||
images,
|
||||
outputs,
|
||||
raw_outputs,
|
||||
raw_outputs_flipped=None,
|
||||
):
|
||||
outputs_large = image_processor.post_process_depth_estimation(
|
||||
raw_outputs,
|
||||
[img.size[::-1] for img in images],
|
||||
outputs_flipped=raw_outputs_flipped,
|
||||
target_sizes=[tuple(np.array(img.size[::-1]) * 2) for img in images],
|
||||
do_remove_padding=pad_input,
|
||||
)
|
||||
|
||||
for img, out, out_l in zip(images, outputs, outputs_large):
|
||||
out = out["predicted_depth"]
|
||||
out_l = out_l["predicted_depth"]
|
||||
out_l_reduced = torch.nn.functional.interpolate(
|
||||
out_l.unsqueeze(0).unsqueeze(1), size=img.size[::-1], mode="bicubic", align_corners=False
|
||||
)
|
||||
self.assertTrue((np.array(out_l.shape)[::-1] == np.array(img.size) * 2).all())
|
||||
self.assertTrue(torch.allclose(out, out_l_reduced, rtol=2e-2))
|
||||
|
||||
def check_post_processing_test(self, image_processor, images, model, pad_input=True, flip_aug=True):
|
||||
inputs = image_processor(images=images, return_tensors="pt", do_pad=pad_input).to(torch_device)
|
||||
|
||||
with torch.no_grad():
|
||||
raw_outputs = model(**inputs)
|
||||
raw_outputs_flipped = None
|
||||
if flip_aug:
|
||||
raw_outputs_flipped = model(pixel_values=torch.flip(inputs.pixel_values, dims=[3]))
|
||||
|
||||
outputs = image_processor.post_process_depth_estimation(
|
||||
raw_outputs,
|
||||
[img.size[::-1] for img in images],
|
||||
outputs_flipped=raw_outputs_flipped,
|
||||
do_remove_padding=pad_input,
|
||||
)
|
||||
|
||||
expected_slices = torch.tensor(self.expected_slice_post_processing[pad_input, flip_aug]).to(torch_device)
|
||||
for img, out, expected_slice in zip(images, outputs, expected_slices):
|
||||
out = out["predicted_depth"]
|
||||
self.assertTrue(img.size == out.shape[::-1])
|
||||
self.assertTrue(torch.allclose(expected_slice, out[:3, :3], rtol=1e-3))
|
||||
|
||||
self.check_target_size(image_processor, pad_input, images, outputs, raw_outputs, raw_outputs_flipped)
|
||||
|
||||
def test_post_processing_depth_estimation_post_processing_nopad_noflip(self):
|
||||
images = [prepare_img(), Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")]
|
||||
image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", keep_aspect_ratio=False)
|
||||
model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device)
|
||||
|
||||
self.check_post_processing_test(image_processor, images, model, pad_input=False, flip_aug=False)
|
||||
|
||||
def test_inference_depth_estimation_post_processing_nopad_flip(self):
|
||||
images = [prepare_img(), Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")]
|
||||
image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", keep_aspect_ratio=False)
|
||||
model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device)
|
||||
|
||||
self.check_post_processing_test(image_processor, images, model, pad_input=False, flip_aug=True)
|
||||
|
||||
def test_inference_depth_estimation_post_processing_pad_noflip(self):
|
||||
images = [prepare_img(), Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")]
|
||||
image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", keep_aspect_ratio=False)
|
||||
model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device)
|
||||
|
||||
self.check_post_processing_test(image_processor, images, model, pad_input=True, flip_aug=False)
|
||||
|
||||
def test_inference_depth_estimation_post_processing_pad_flip(self):
|
||||
images = [prepare_img(), Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")]
|
||||
image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", keep_aspect_ratio=False)
|
||||
model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device)
|
||||
|
||||
self.check_post_processing_test(image_processor, images, model, pad_input=True, flip_aug=True)
|
||||
|
||||
Reference in New Issue
Block a user