Add Depth Anything V2 Metric models (#32126)
* add checkpoint and repo names
* adapt head to support metric depth estimation
* add max_depth output scaling
* add expected logits
* improve docs
* fix docstring
* add checkpoint and repo names
* adapt head to support metric depth estimation
* add max_depth output scaling
* add expected logits
* improve docs
* fix docstring
* rename depth_estimation to depth_estimation_type
* add integration test
* Refactored tests to include metric depth model inference test
* Integration test passes when the timm backbone lines are commented out (L220-L227)
* address feedback
* replace model path to use organization path
* formatting
* delete deprecated TODO
* address feedback
* [run_slow] depth_anything
This commit is contained in:
@@ -27,7 +27,7 @@ logger = logging.get_logger(__name__)
|
|||||||
|
|
||||||
class DepthAnythingConfig(PretrainedConfig):
|
class DepthAnythingConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
This is the configuration class to store the configuration of a [`DepthAnythingModel`]. It is used to instantiate an DepthAnything
|
This is the configuration class to store the configuration of a [`DepthAnythingModel`]. It is used to instantiate a DepthAnything
|
||||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
||||||
defaults will yield a similar configuration to that of the DepthAnything
|
defaults will yield a similar configuration to that of the DepthAnything
|
||||||
[LiheYoung/depth-anything-small-hf](https://huggingface.co/LiheYoung/depth-anything-small-hf) architecture.
|
[LiheYoung/depth-anything-small-hf](https://huggingface.co/LiheYoung/depth-anything-small-hf) architecture.
|
||||||
@@ -67,6 +67,11 @@ class DepthAnythingConfig(PretrainedConfig):
|
|||||||
The index of the features to use in the depth estimation head.
|
The index of the features to use in the depth estimation head.
|
||||||
head_hidden_size (`int`, *optional*, defaults to 32):
|
head_hidden_size (`int`, *optional*, defaults to 32):
|
||||||
The number of output channels in the second convolution of the depth estimation head.
|
The number of output channels in the second convolution of the depth estimation head.
|
||||||
|
depth_estimation_type (`str`, *optional*, defaults to `"relative"`):
|
||||||
|
The type of depth estimation to use. Can be one of `["relative", "metric"]`.
|
||||||
|
max_depth (`float`, *optional*):
|
||||||
|
The maximum depth to use for the "metric" depth estimation head. 20 should be used for indoor models
|
||||||
|
and 80 for outdoor models. For "relative" depth estimation, this value is ignored.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
@@ -100,6 +105,8 @@ class DepthAnythingConfig(PretrainedConfig):
|
|||||||
fusion_hidden_size=64,
|
fusion_hidden_size=64,
|
||||||
head_in_index=-1,
|
head_in_index=-1,
|
||||||
head_hidden_size=32,
|
head_hidden_size=32,
|
||||||
|
depth_estimation_type="relative",
|
||||||
|
max_depth=None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
@@ -139,6 +146,10 @@ class DepthAnythingConfig(PretrainedConfig):
|
|||||||
self.fusion_hidden_size = fusion_hidden_size
|
self.fusion_hidden_size = fusion_hidden_size
|
||||||
self.head_in_index = head_in_index
|
self.head_in_index = head_in_index
|
||||||
self.head_hidden_size = head_hidden_size
|
self.head_hidden_size = head_hidden_size
|
||||||
|
if depth_estimation_type not in ["relative", "metric"]:
|
||||||
|
raise ValueError("depth_estimation_type must be one of ['relative', 'metric']")
|
||||||
|
self.depth_estimation_type = depth_estimation_type
|
||||||
|
self.max_depth = max_depth if max_depth else 1
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -56,12 +56,21 @@ def get_dpt_config(model_name):
|
|||||||
else:
|
else:
|
||||||
raise NotImplementedError(f"Model not supported: {model_name}")
|
raise NotImplementedError(f"Model not supported: {model_name}")
|
||||||
|
|
||||||
|
if "metric" in model_name:
|
||||||
|
depth_estimation_type = "metric"
|
||||||
|
max_depth = 20 if "indoor" in model_name else 80
|
||||||
|
else:
|
||||||
|
depth_estimation_type = "relative"
|
||||||
|
max_depth = None
|
||||||
|
|
||||||
config = DepthAnythingConfig(
|
config = DepthAnythingConfig(
|
||||||
reassemble_hidden_size=backbone_config.hidden_size,
|
reassemble_hidden_size=backbone_config.hidden_size,
|
||||||
patch_size=backbone_config.patch_size,
|
patch_size=backbone_config.patch_size,
|
||||||
backbone_config=backbone_config,
|
backbone_config=backbone_config,
|
||||||
fusion_hidden_size=fusion_hidden_size,
|
fusion_hidden_size=fusion_hidden_size,
|
||||||
neck_hidden_sizes=neck_hidden_sizes,
|
neck_hidden_sizes=neck_hidden_sizes,
|
||||||
|
depth_estimation_type=depth_estimation_type,
|
||||||
|
max_depth=max_depth,
|
||||||
)
|
)
|
||||||
|
|
||||||
return config
|
return config
|
||||||
@@ -178,6 +187,12 @@ name_to_checkpoint = {
|
|||||||
"depth-anything-v2-small": "depth_anything_v2_vits.pth",
|
"depth-anything-v2-small": "depth_anything_v2_vits.pth",
|
||||||
"depth-anything-v2-base": "depth_anything_v2_vitb.pth",
|
"depth-anything-v2-base": "depth_anything_v2_vitb.pth",
|
||||||
"depth-anything-v2-large": "depth_anything_v2_vitl.pth",
|
"depth-anything-v2-large": "depth_anything_v2_vitl.pth",
|
||||||
|
"depth-anything-v2-metric-indoor-small": "depth_anything_v2_metric_hypersim_vits.pth",
|
||||||
|
"depth-anything-v2-metric-indoor-base": "depth_anything_v2_metric_hypersim_vitb.pth",
|
||||||
|
"depth-anything-v2-metric-indoor-large": "depth_anything_v2_metric_hypersim_vitl.pth",
|
||||||
|
"depth-anything-v2-metric-outdoor-small": "depth_anything_v2_metric_vkitti_vits.pth",
|
||||||
|
"depth-anything-v2-metric-outdoor-base": "depth_anything_v2_metric_vkitti_vitb.pth",
|
||||||
|
"depth-anything-v2-metric-outdoor-large": "depth_anything_v2_metric_vkitti_vitl.pth",
|
||||||
# v2-giant pending
|
# v2-giant pending
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -198,6 +213,12 @@ def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, ve
|
|||||||
"depth-anything-v2-small": "depth-anything/Depth-Anything-V2-Small",
|
"depth-anything-v2-small": "depth-anything/Depth-Anything-V2-Small",
|
||||||
"depth-anything-v2-base": "depth-anything/Depth-Anything-V2-Base",
|
"depth-anything-v2-base": "depth-anything/Depth-Anything-V2-Base",
|
||||||
"depth-anything-v2-large": "depth-anything/Depth-Anything-V2-Large",
|
"depth-anything-v2-large": "depth-anything/Depth-Anything-V2-Large",
|
||||||
|
"depth-anything-v2-metric-indoor-small": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Small",
|
||||||
|
"depth-anything-v2-metric-indoor-base": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Base",
|
||||||
|
"depth-anything-v2-metric-indoor-large": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Large",
|
||||||
|
"depth-anything-v2-metric-outdoor-small": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Small",
|
||||||
|
"depth-anything-v2-metric-outdoor-base": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Base",
|
||||||
|
"depth-anything-v2-metric-outdoor-large": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Large",
|
||||||
}
|
}
|
||||||
|
|
||||||
# load original state_dict
|
# load original state_dict
|
||||||
@@ -272,6 +293,30 @@ def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, ve
|
|||||||
expected_slice = torch.tensor(
|
expected_slice = torch.tensor(
|
||||||
[[162.2751, 161.8504, 162.8788], [160.3138, 160.8050, 161.9835], [159.3812, 159.9884, 160.0768]]
|
[[162.2751, 161.8504, 162.8788], [160.3138, 160.8050, 161.9835], [159.3812, 159.9884, 160.0768]]
|
||||||
)
|
)
|
||||||
|
elif model_name == "depth-anything-v2-metric-indoor-small":
|
||||||
|
expected_slice = torch.tensor(
|
||||||
|
[[1.3349, 1.2946, 1.2801], [1.2793, 1.2337, 1.2899], [1.2629, 1.2218, 1.2476]]
|
||||||
|
)
|
||||||
|
elif model_name == "depth-anything-v2-metric-indoor-base":
|
||||||
|
expected_slice = torch.tensor(
|
||||||
|
[[1.4601, 1.3824, 1.4904], [1.5031, 1.4349, 1.4274], [1.4570, 1.4578, 1.4200]]
|
||||||
|
)
|
||||||
|
elif model_name == "depth-anything-v2-metric-indoor-large":
|
||||||
|
expected_slice = torch.tensor(
|
||||||
|
[[1.5040, 1.5019, 1.5218], [1.5087, 1.5195, 1.5149], [1.5437, 1.5128, 1.5252]]
|
||||||
|
)
|
||||||
|
elif model_name == "depth-anything-v2-metric-outdoor-small":
|
||||||
|
expected_slice = torch.tensor(
|
||||||
|
[[9.5804, 8.0339, 7.7386], [7.9890, 7.2464, 7.7149], [7.7021, 7.2330, 7.3304]]
|
||||||
|
)
|
||||||
|
elif model_name == "depth-anything-v2-metric-outdoor-base":
|
||||||
|
expected_slice = torch.tensor(
|
||||||
|
[[10.2916, 9.0933, 8.8622], [9.1964, 9.3393, 9.0644], [8.9618, 9.4201, 9.2262]]
|
||||||
|
)
|
||||||
|
elif model_name == "depth-anything-v2-metric-outdoor-large":
|
||||||
|
expected_slice = torch.tensor(
|
||||||
|
[[14.0137, 13.3627, 13.1080], [13.2522, 13.3943, 13.3705], [13.0581, 13.4505, 13.3925]]
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError("Not supported")
|
raise ValueError("Not supported")
|
||||||
|
|
||||||
|
|||||||
@@ -54,7 +54,6 @@ DEPTH_ANYTHING_INPUTS_DOCSTRING = r"""
|
|||||||
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||||
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`]
|
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`]
|
||||||
for details.
|
for details.
|
||||||
|
|
||||||
output_attentions (`bool`, *optional*):
|
output_attentions (`bool`, *optional*):
|
||||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||||
tensors for more detail.
|
tensors for more detail.
|
||||||
@@ -318,7 +317,8 @@ class DepthAnythingDepthEstimationHead(nn.Module):
|
|||||||
"""
|
"""
|
||||||
Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
|
Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
|
||||||
the predictions to the input resolution after the first convolutional layer (details can be found in the DPT paper's
|
the predictions to the input resolution after the first convolutional layer (details can be found in the DPT paper's
|
||||||
supplementary material).
|
supplementary material). The final activation function is either ReLU or Sigmoid, depending on the depth estimation
|
||||||
|
type (relative or metric). For metric depth estimation, the output is scaled by the maximum depth used during pretraining.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -332,7 +332,13 @@ class DepthAnythingDepthEstimationHead(nn.Module):
|
|||||||
self.conv2 = nn.Conv2d(features // 2, config.head_hidden_size, kernel_size=3, stride=1, padding=1)
|
self.conv2 = nn.Conv2d(features // 2, config.head_hidden_size, kernel_size=3, stride=1, padding=1)
|
||||||
self.activation1 = nn.ReLU()
|
self.activation1 = nn.ReLU()
|
||||||
self.conv3 = nn.Conv2d(config.head_hidden_size, 1, kernel_size=1, stride=1, padding=0)
|
self.conv3 = nn.Conv2d(config.head_hidden_size, 1, kernel_size=1, stride=1, padding=0)
|
||||||
self.activation2 = nn.ReLU()
|
if config.depth_estimation_type == "relative":
|
||||||
|
self.activation2 = nn.ReLU()
|
||||||
|
elif config.depth_estimation_type == "metric":
|
||||||
|
self.activation2 = nn.Sigmoid()
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown depth estimation type: {config.depth_estimation_type}")
|
||||||
|
self.max_depth = config.max_depth
|
||||||
|
|
||||||
def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) -> torch.Tensor:
|
def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) -> torch.Tensor:
|
||||||
hidden_states = hidden_states[self.head_in_index]
|
hidden_states = hidden_states[self.head_in_index]
|
||||||
@@ -347,7 +353,7 @@ class DepthAnythingDepthEstimationHead(nn.Module):
|
|||||||
predicted_depth = self.conv2(predicted_depth)
|
predicted_depth = self.conv2(predicted_depth)
|
||||||
predicted_depth = self.activation1(predicted_depth)
|
predicted_depth = self.activation1(predicted_depth)
|
||||||
predicted_depth = self.conv3(predicted_depth)
|
predicted_depth = self.conv3(predicted_depth)
|
||||||
predicted_depth = self.activation2(predicted_depth)
|
predicted_depth = self.activation2(predicted_depth) * self.max_depth
|
||||||
predicted_depth = predicted_depth.squeeze(dim=1) # shape (batch_size, height, width)
|
predicted_depth = predicted_depth.squeeze(dim=1) # shape (batch_size, height, width)
|
||||||
|
|
||||||
return predicted_depth
|
return predicted_depth
|
||||||
|
|||||||
@@ -246,6 +246,7 @@ def prepare_img():
|
|||||||
@slow
|
@slow
|
||||||
class DepthAnythingModelIntegrationTest(unittest.TestCase):
|
class DepthAnythingModelIntegrationTest(unittest.TestCase):
|
||||||
def test_inference(self):
|
def test_inference(self):
|
||||||
|
# -- `relative` depth model --
|
||||||
image_processor = DPTImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf")
|
image_processor = DPTImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf")
|
||||||
model = DepthAnythingForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf").to(torch_device)
|
model = DepthAnythingForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf").to(torch_device)
|
||||||
|
|
||||||
@@ -265,4 +266,27 @@ class DepthAnythingModelIntegrationTest(unittest.TestCase):
|
|||||||
[[8.8204, 8.6468, 8.6195], [8.3313, 8.6027, 8.7526], [8.6526, 8.6866, 8.7453]],
|
[[8.8204, 8.6468, 8.6195], [8.3313, 8.6027, 8.7526], [8.6526, 8.6866, 8.7453]],
|
||||||
).to(torch_device)
|
).to(torch_device)
|
||||||
|
|
||||||
self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-6))
|
self.assertTrue(torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-6))
|
||||||
|
|
||||||
|
# -- `metric` depth model --
|
||||||
|
image_processor = DPTImageProcessor.from_pretrained("depth-anything/depth-anything-V2-metric-indoor-small-hf")
|
||||||
|
model = DepthAnythingForDepthEstimation.from_pretrained(
|
||||||
|
"depth-anything/depth-anything-V2-metric-indoor-small-hf"
|
||||||
|
).to(torch_device)
|
||||||
|
|
||||||
|
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||||
|
|
||||||
|
# forward pass
|
||||||
|
with torch.no_grad():
|
||||||
|
outputs = model(**inputs)
|
||||||
|
predicted_depth = outputs.predicted_depth
|
||||||
|
|
||||||
|
# verify the predicted depth
|
||||||
|
expected_shape = torch.Size([1, 518, 686])
|
||||||
|
self.assertEqual(predicted_depth.shape, expected_shape)
|
||||||
|
|
||||||
|
expected_slice = torch.tensor(
|
||||||
|
[[1.3349, 1.2946, 1.2801], [1.2793, 1.2337, 1.2899], [1.2629, 1.2218, 1.2476]],
|
||||||
|
).to(torch_device)
|
||||||
|
|
||||||
|
self.assertTrue(torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4))
|
||||||
|
|||||||
Reference in New Issue
Block a user