From cc25757a44d08cc7e3cca32554455cbcfc5db957 Mon Sep 17 00:00:00 2001 From: Bertrand Thia <56003053+bt2513@users.noreply.github.com> Date: Tue, 13 Aug 2024 10:16:30 -0400 Subject: [PATCH] Add Depth Anything V2 Metric models (#32126) * add checkpoint and repo names * adapt head to support metric depth estimation * add max_depth output scaling * add expected logits * improve docs * fix docstring * add checkpoint and repo names * adapt head to support metric depth estimation * add max_depth output scaling * add expected logits * improve docs * fix docstring * rename depth_estimation to depth_estimation_type * add integration test * Refactored tests to include metric depth model inference test * Integration test pass when the timm backbone lines are commented (L220-L227) * address feedback * replace model path to use organization path * formatting * delete deprecated TODO * address feedback * [run_slow] depth_anything --- .../configuration_depth_anything.py | 13 +++++- .../convert_depth_anything_to_hf.py | 45 +++++++++++++++++++ .../depth_anything/modeling_depth_anything.py | 14 ++++-- .../test_modeling_depth_anything.py | 26 ++++++++++- 4 files changed, 92 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/depth_anything/configuration_depth_anything.py b/src/transformers/models/depth_anything/configuration_depth_anything.py index 78ccbc381d..e1b472bdce 100644 --- a/src/transformers/models/depth_anything/configuration_depth_anything.py +++ b/src/transformers/models/depth_anything/configuration_depth_anything.py @@ -27,7 +27,7 @@ logger = logging.get_logger(__name__) class DepthAnythingConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`DepthAnythingModel`]. It is used to instantiate an DepthAnything + This is the configuration class to store the configuration of a [`DepthAnythingModel`]. It is used to instantiate a DepthAnything model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the DepthAnything [LiheYoung/depth-anything-small-hf](https://huggingface.co/LiheYoung/depth-anything-small-hf) architecture. @@ -67,6 +67,11 @@ class DepthAnythingConfig(PretrainedConfig): The index of the features to use in the depth estimation head. head_hidden_size (`int`, *optional*, defaults to 32): The number of output channels in the second convolution of the depth estimation head. + depth_estimation_type (`str`, *optional*, defaults to `"relative"`): + The type of depth estimation to use. Can be one of `["relative", "metric"]`. + max_depth (`float`, *optional*): + The maximum depth to use for the "metric" depth estimation head. 20 should be used for indoor models + and 80 for outdoor models. For "relative" depth estimation, leave this unset (the output is then scaled by 1). Example: @@ -100,6 +105,8 @@ class DepthAnythingConfig(PretrainedConfig): fusion_hidden_size=64, head_in_index=-1, head_hidden_size=32, + depth_estimation_type="relative", + max_depth=None, **kwargs, ): super().__init__(**kwargs) @@ -139,6 +146,10 @@ class DepthAnythingConfig(PretrainedConfig): self.fusion_hidden_size = fusion_hidden_size self.head_in_index = head_in_index self.head_hidden_size = head_hidden_size + if depth_estimation_type not in ["relative", "metric"]: + raise ValueError("depth_estimation_type must be one of ['relative', 'metric']") + self.depth_estimation_type = depth_estimation_type + self.max_depth = max_depth if max_depth else 1 def to_dict(self): """ diff --git a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py index 3e45c95de9..5c6da13ae8 100644 --- a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py +++ b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py @@ -56,12 +56,21 @@ def get_dpt_config(model_name): else: raise NotImplementedError(f"Model not supported: 
{model_name}") + if "metric" in model_name: + depth_estimation_type = "metric" + max_depth = 20 if "indoor" in model_name else 80 + else: + depth_estimation_type = "relative" + max_depth = None + config = DepthAnythingConfig( reassemble_hidden_size=backbone_config.hidden_size, patch_size=backbone_config.patch_size, backbone_config=backbone_config, fusion_hidden_size=fusion_hidden_size, neck_hidden_sizes=neck_hidden_sizes, + depth_estimation_type=depth_estimation_type, + max_depth=max_depth, ) return config @@ -178,6 +187,12 @@ name_to_checkpoint = { "depth-anything-v2-small": "depth_anything_v2_vits.pth", "depth-anything-v2-base": "depth_anything_v2_vitb.pth", "depth-anything-v2-large": "depth_anything_v2_vitl.pth", + "depth-anything-v2-metric-indoor-small": "depth_anything_v2_metric_hypersim_vits.pth", + "depth-anything-v2-metric-indoor-base": "depth_anything_v2_metric_hypersim_vitb.pth", + "depth-anything-v2-metric-indoor-large": "depth_anything_v2_metric_hypersim_vitl.pth", + "depth-anything-v2-metric-outdoor-small": "depth_anything_v2_metric_vkitti_vits.pth", + "depth-anything-v2-metric-outdoor-base": "depth_anything_v2_metric_vkitti_vitb.pth", + "depth-anything-v2-metric-outdoor-large": "depth_anything_v2_metric_vkitti_vitl.pth", # v2-giant pending } @@ -198,6 +213,12 @@ def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, ve "depth-anything-v2-small": "depth-anything/Depth-Anything-V2-Small", "depth-anything-v2-base": "depth-anything/Depth-Anything-V2-Base", "depth-anything-v2-large": "depth-anything/Depth-Anything-V2-Large", + "depth-anything-v2-metric-indoor-small": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Small", + "depth-anything-v2-metric-indoor-base": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Base", + "depth-anything-v2-metric-indoor-large": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Large", + "depth-anything-v2-metric-outdoor-small": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Small", + 
"depth-anything-v2-metric-outdoor-base": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Base", + "depth-anything-v2-metric-outdoor-large": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Large", } # load original state_dict @@ -272,6 +293,30 @@ def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, ve expected_slice = torch.tensor( [[162.2751, 161.8504, 162.8788], [160.3138, 160.8050, 161.9835], [159.3812, 159.9884, 160.0768]] ) + elif model_name == "depth-anything-v2-metric-indoor-small": + expected_slice = torch.tensor( + [[1.3349, 1.2946, 1.2801], [1.2793, 1.2337, 1.2899], [1.2629, 1.2218, 1.2476]] + ) + elif model_name == "depth-anything-v2-metric-indoor-base": + expected_slice = torch.tensor( + [[1.4601, 1.3824, 1.4904], [1.5031, 1.4349, 1.4274], [1.4570, 1.4578, 1.4200]] + ) + elif model_name == "depth-anything-v2-metric-indoor-large": + expected_slice = torch.tensor( + [[1.5040, 1.5019, 1.5218], [1.5087, 1.5195, 1.5149], [1.5437, 1.5128, 1.5252]] + ) + elif model_name == "depth-anything-v2-metric-outdoor-small": + expected_slice = torch.tensor( + [[9.5804, 8.0339, 7.7386], [7.9890, 7.2464, 7.7149], [7.7021, 7.2330, 7.3304]] + ) + elif model_name == "depth-anything-v2-metric-outdoor-base": + expected_slice = torch.tensor( + [[10.2916, 9.0933, 8.8622], [9.1964, 9.3393, 9.0644], [8.9618, 9.4201, 9.2262]] + ) + elif model_name == "depth-anything-v2-metric-outdoor-large": + expected_slice = torch.tensor( + [[14.0137, 13.3627, 13.1080], [13.2522, 13.3943, 13.3705], [13.0581, 13.4505, 13.3925]] + ) else: raise ValueError("Not supported") diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index e37f0a3eaf..e24b38be64 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -54,7 +54,6 @@ DEPTH_ANYTHING_INPUTS_DOCSTRING = r""" pixel_values 
(`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] for details. - output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -318,7 +317,8 @@ class DepthAnythingDepthEstimationHead(nn.Module): """ Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples the predictions to the input resolution after the first convolutional layer (details can be found in the DPT paper's - supplementary material). + supplementary material). The final activation function is either ReLU or Sigmoid, depending on the depth estimation + type (relative or metric). For metric depth estimation, the output is scaled by the maximum depth used during pretraining. """ def __init__(self, config): @@ -332,7 +332,13 @@ class DepthAnythingDepthEstimationHead(nn.Module): self.conv2 = nn.Conv2d(features // 2, config.head_hidden_size, kernel_size=3, stride=1, padding=1) self.activation1 = nn.ReLU() self.conv3 = nn.Conv2d(config.head_hidden_size, 1, kernel_size=1, stride=1, padding=0) - self.activation2 = nn.ReLU() + if config.depth_estimation_type == "relative": + self.activation2 = nn.ReLU() + elif config.depth_estimation_type == "metric": + self.activation2 = nn.Sigmoid() + else: + raise ValueError(f"Unknown depth estimation type: {config.depth_estimation_type}") + self.max_depth = config.max_depth def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) -> torch.Tensor: hidden_states = hidden_states[self.head_in_index] @@ -347,7 +353,7 @@ class DepthAnythingDepthEstimationHead(nn.Module): predicted_depth = self.conv2(predicted_depth) predicted_depth = self.activation1(predicted_depth) predicted_depth = self.conv3(predicted_depth) - predicted_depth = self.activation2(predicted_depth) 
+ predicted_depth = self.activation2(predicted_depth) * self.max_depth predicted_depth = predicted_depth.squeeze(dim=1) # shape (batch_size, height, width) return predicted_depth diff --git a/tests/models/depth_anything/test_modeling_depth_anything.py b/tests/models/depth_anything/test_modeling_depth_anything.py index d78671d559..0e59bc4d13 100644 --- a/tests/models/depth_anything/test_modeling_depth_anything.py +++ b/tests/models/depth_anything/test_modeling_depth_anything.py @@ -246,6 +246,7 @@ def prepare_img(): @slow class DepthAnythingModelIntegrationTest(unittest.TestCase): def test_inference(self): + # -- `relative` depth model -- image_processor = DPTImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf") model = DepthAnythingForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf").to(torch_device) @@ -265,4 +266,27 @@ class DepthAnythingModelIntegrationTest(unittest.TestCase): [[8.8204, 8.6468, 8.6195], [8.3313, 8.6027, 8.7526], [8.6526, 8.6866, 8.7453]], ).to(torch_device) - self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-6)) + self.assertTrue(torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-6)) + + # -- `metric` depth model -- + image_processor = DPTImageProcessor.from_pretrained("depth-anything/depth-anything-V2-metric-indoor-small-hf") + model = DepthAnythingForDepthEstimation.from_pretrained( + "depth-anything/depth-anything-V2-metric-indoor-small-hf" + ).to(torch_device) + + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size([1, 518, 686]) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[1.3349, 1.2946, 1.2801], [1.2793, 1.2337, 1.2899], [1.2629, 1.2218, 1.2476]], + ).to(torch_device) + + 
self.assertTrue(torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4))