Add Depth Anything V2 Metric models (#32126)

* add checkpoint and repo names * adapt head to support metric depth estimation * add max_depth output scaling * add expected logits * improve docs * fix docstring * add checkpoint and repo names * adapt head to support metric depth estimation * add max_depth output scaling * add expected logits * improve docs * fix docstring * rename depth_estimation to depth_estimation_type * add integration test * Refactored tests to include metric depth model inference test * Integration test pass when the timm backbone lines are commented (L220-L227) * address feedback * replace model path to use organization path * formatting * delete deprecated TODO * address feedback * [run_slow] depth_anything
2024-08-13 10:16:30 -04:00
parent 481e15604a
commit cc25757a44
4 changed files with 92 additions and 6 deletions
--- a/src/transformers/models/depth_anything/configuration_depth_anything.py
+++ b/src/transformers/models/depth_anything/configuration_depth_anything.py
@@ -27,7 +27,7 @@ logger = logging.get_logger(__name__)

 class DepthAnythingConfig(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a [`DepthAnythingModel`]. It is used to instantiate an DepthAnything
+    This is the configuration class to store the configuration of a [`DepthAnythingModel`]. It is used to instantiate a DepthAnything
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DepthAnything
    [LiheYoung/depth-anything-small-hf](https://huggingface.co/LiheYoung/depth-anything-small-hf) architecture.
@@ -67,6 +67,11 @@ class DepthAnythingConfig(PretrainedConfig):
            The index of the features to use in the depth estimation head.
        head_hidden_size (`int`, *optional*, defaults to 32):
            The number of output channels in the second convolution of the depth estimation head.
+        depth_estimation_type (`str`, *optional*, defaults to `"relative"`):
+            The type of depth estimation to use. Can be one of `["relative", "metric"]`.
+        max_depth (`float`, *optional*):
+            The maximum depth to use for the "metric" depth estimation head. 20 should be used for indoor models
+            and 80 for outdoor models. For "relative" depth estimation, this value is ignored.

    Example:

@@ -100,6 +105,8 @@ class DepthAnythingConfig(PretrainedConfig):
        fusion_hidden_size=64,
        head_in_index=-1,
        head_hidden_size=32,
+        depth_estimation_type="relative",
+        max_depth=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
@@ -139,6 +146,10 @@ class DepthAnythingConfig(PretrainedConfig):
        self.fusion_hidden_size = fusion_hidden_size
        self.head_in_index = head_in_index
        self.head_hidden_size = head_hidden_size
+        if depth_estimation_type not in ["relative", "metric"]:
+            raise ValueError("depth_estimation_type must be one of ['relative', 'metric']")
+        self.depth_estimation_type = depth_estimation_type
+        self.max_depth = max_depth if max_depth else 1

    def to_dict(self):
        """
--- a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py
+++ b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py
@@ -56,12 +56,21 @@ def get_dpt_config(model_name):
    else:
        raise NotImplementedError(f"Model not supported: {model_name}")

+    if "metric" in model_name:
+        depth_estimation_type = "metric"
+        max_depth = 20 if "indoor" in model_name else 80
+    else:
+        depth_estimation_type = "relative"
+        max_depth = None
+
    config = DepthAnythingConfig(
        reassemble_hidden_size=backbone_config.hidden_size,
        patch_size=backbone_config.patch_size,
        backbone_config=backbone_config,
        fusion_hidden_size=fusion_hidden_size,
        neck_hidden_sizes=neck_hidden_sizes,
+        depth_estimation_type=depth_estimation_type,
+        max_depth=max_depth,
    )

    return config
@@ -178,6 +187,12 @@ name_to_checkpoint = {
    "depth-anything-v2-small": "depth_anything_v2_vits.pth",
    "depth-anything-v2-base": "depth_anything_v2_vitb.pth",
    "depth-anything-v2-large": "depth_anything_v2_vitl.pth",
+    "depth-anything-v2-metric-indoor-small": "depth_anything_v2_metric_hypersim_vits.pth",
+    "depth-anything-v2-metric-indoor-base": "depth_anything_v2_metric_hypersim_vitb.pth",
+    "depth-anything-v2-metric-indoor-large": "depth_anything_v2_metric_hypersim_vitl.pth",
+    "depth-anything-v2-metric-outdoor-small": "depth_anything_v2_metric_vkitti_vits.pth",
+    "depth-anything-v2-metric-outdoor-base": "depth_anything_v2_metric_vkitti_vitb.pth",
+    "depth-anything-v2-metric-outdoor-large": "depth_anything_v2_metric_vkitti_vitl.pth",
    # v2-giant pending
 }

@@ -198,6 +213,12 @@ def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, ve
        "depth-anything-v2-small": "depth-anything/Depth-Anything-V2-Small",
        "depth-anything-v2-base": "depth-anything/Depth-Anything-V2-Base",
        "depth-anything-v2-large": "depth-anything/Depth-Anything-V2-Large",
+        "depth-anything-v2-metric-indoor-small": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Small",
+        "depth-anything-v2-metric-indoor-base": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Base",
+        "depth-anything-v2-metric-indoor-large": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Large",
+        "depth-anything-v2-metric-outdoor-small": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Small",
+        "depth-anything-v2-metric-outdoor-base": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Base",
+        "depth-anything-v2-metric-outdoor-large": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Large",
    }

    # load original state_dict
@@ -272,6 +293,30 @@ def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, ve
            expected_slice = torch.tensor(
                [[162.2751, 161.8504, 162.8788], [160.3138, 160.8050, 161.9835], [159.3812, 159.9884, 160.0768]]
            )
+        elif model_name == "depth-anything-v2-metric-indoor-small":
+            expected_slice = torch.tensor(
+                [[1.3349, 1.2946, 1.2801], [1.2793, 1.2337, 1.2899], [1.2629, 1.2218, 1.2476]]
+            )
+        elif model_name == "depth-anything-v2-metric-indoor-base":
+            expected_slice = torch.tensor(
+                [[1.4601, 1.3824, 1.4904], [1.5031, 1.4349, 1.4274], [1.4570, 1.4578, 1.4200]]
+            )
+        elif model_name == "depth-anything-v2-metric-indoor-large":
+            expected_slice = torch.tensor(
+                [[1.5040, 1.5019, 1.5218], [1.5087, 1.5195, 1.5149], [1.5437, 1.5128, 1.5252]]
+            )
+        elif model_name == "depth-anything-v2-metric-outdoor-small":
+            expected_slice = torch.tensor(
+                [[9.5804, 8.0339, 7.7386], [7.9890, 7.2464, 7.7149], [7.7021, 7.2330, 7.3304]]
+            )
+        elif model_name == "depth-anything-v2-metric-outdoor-base":
+            expected_slice = torch.tensor(
+                [[10.2916, 9.0933, 8.8622], [9.1964, 9.3393, 9.0644], [8.9618, 9.4201, 9.2262]]
+            )
+        elif model_name == "depth-anything-v2-metric-outdoor-large":
+            expected_slice = torch.tensor(
+                [[14.0137, 13.3627, 13.1080], [13.2522, 13.3943, 13.3705], [13.0581, 13.4505, 13.3925]]
+            )
        else:
            raise ValueError("Not supported")

--- a/src/transformers/models/depth_anything/modeling_depth_anything.py
+++ b/src/transformers/models/depth_anything/modeling_depth_anything.py
@@ -54,7 +54,6 @@ DEPTH_ANYTHING_INPUTS_DOCSTRING = r"""
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`]
            for details.
-
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
@@ -318,7 +317,8 @@ class DepthAnythingDepthEstimationHead(nn.Module):
    """
    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the DPT paper's
-    supplementary material).
+    supplementary material). The final activation function is either ReLU or Sigmoid, depending on the depth estimation
+    type (relative or metric). For metric depth estimation, the output is scaled by the maximum depth used during pretraining.
    """

    def __init__(self, config):
@@ -332,7 +332,13 @@ class DepthAnythingDepthEstimationHead(nn.Module):
        self.conv2 = nn.Conv2d(features // 2, config.head_hidden_size, kernel_size=3, stride=1, padding=1)
        self.activation1 = nn.ReLU()
        self.conv3 = nn.Conv2d(config.head_hidden_size, 1, kernel_size=1, stride=1, padding=0)
-        self.activation2 = nn.ReLU()
+        if config.depth_estimation_type == "relative":
+            self.activation2 = nn.ReLU()
+        elif config.depth_estimation_type == "metric":
+            self.activation2 = nn.Sigmoid()
+        else:
+            raise ValueError(f"Unknown depth estimation type: {config.depth_estimation_type}")
+        self.max_depth = config.max_depth

    def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) -> torch.Tensor:
        hidden_states = hidden_states[self.head_in_index]
@@ -347,7 +353,7 @@ class DepthAnythingDepthEstimationHead(nn.Module):
        predicted_depth = self.conv2(predicted_depth)
        predicted_depth = self.activation1(predicted_depth)
        predicted_depth = self.conv3(predicted_depth)
-        predicted_depth = self.activation2(predicted_depth)
+        predicted_depth = self.activation2(predicted_depth) * self.max_depth
        predicted_depth = predicted_depth.squeeze(dim=1)  # shape (batch_size, height, width)

        return predicted_depth
--- a/tests/models/depth_anything/test_modeling_depth_anything.py
+++ b/tests/models/depth_anything/test_modeling_depth_anything.py
@@ -246,6 +246,7 @@ def prepare_img():
@slow
 class DepthAnythingModelIntegrationTest(unittest.TestCase):
    def test_inference(self):
+        # -- `relative` depth model --
        image_processor = DPTImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf")
        model = DepthAnythingForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf").to(torch_device)

@@ -265,4 +266,27 @@ class DepthAnythingModelIntegrationTest(unittest.TestCase):
            [[8.8204, 8.6468, 8.6195], [8.3313, 8.6027, 8.7526], [8.6526, 8.6866, 8.7453]],
        ).to(torch_device)

-        self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-6))
+        self.assertTrue(torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-6))
+
+        # -- `metric` depth model --
+        image_processor = DPTImageProcessor.from_pretrained("depth-anything/depth-anything-V2-metric-indoor-small-hf")
+        model = DepthAnythingForDepthEstimation.from_pretrained(
+            "depth-anything/depth-anything-V2-metric-indoor-small-hf"
+        ).to(torch_device)
+
+        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs)
+            predicted_depth = outputs.predicted_depth
+
+        # verify the predicted depth
+        expected_shape = torch.Size([1, 518, 686])
+        self.assertEqual(predicted_depth.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[1.3349, 1.2946, 1.2801], [1.2793, 1.2337, 1.2899], [1.2629, 1.2218, 1.2476]],
+        ).to(torch_device)
+
+        self.assertTrue(torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4))