From 7e813f9cf0d66a8340c19ea56be964e5f1b440a8 Mon Sep 17 00:00:00 2001
From: Parteek <parteekkamboj112@gmail.com>
Date: Thu, 27 Mar 2025 18:40:03 +0530
Subject: [PATCH] Add Distill Any Depth (#36614)

* Added conversion Script

* Update src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* Updated Conversion Script

* Update src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

---------

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
---
 .../convert_distill_any_depth_to_hf.py        | 246 ++++++++++++++++++
 1 file changed, 246 insertions(+)
 create mode 100644 src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py

diff --git a/src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py b/src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py
new file mode 100644
index 0000000000..3dc8db2e97
--- /dev/null
+++ b/src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py
@@ -0,0 +1,246 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Distill Any Depth checkpoints from the original repository. URL:
+https://github.com/Westlake-AGI-Lab/Distill-Any-Depth"""
+
+import argparse
+import re
+from pathlib import Path
+
+import requests
+import torch
+from huggingface_hub import hf_hub_download
+from PIL import Image
+from safetensors.torch import load_file
+
+from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor
+from transformers.utils import logging
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+
+ORIGINAL_TO_CONVERTED_KEY_MAPPING = {  # original-key regex -> `re.sub` template, or a callable that receives the match
+    r"(backbone|pretrained)\.cls_token": r"backbone.embeddings.cls_token",
+    r"(backbone|pretrained)\.mask_token": r"backbone.embeddings.mask_token",
+    r"(backbone|pretrained)\.pos_embed": r"backbone.embeddings.position_embeddings",
+    r"(backbone|pretrained)\.patch_embed\.proj\.(weight|bias)": r"backbone.embeddings.patch_embeddings.projection.\2",
+    r"(backbone|pretrained)\.norm\.(weight|bias)": r"backbone.layernorm.\2",
+    r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.attn\.proj\.(weight|bias)": r"backbone.encoder.layer.\4.attention.output.dense.\5",
+    r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.ls(1|2)\.gamma": r"backbone.encoder.layer.\4.layer_scale\5.lambda1",
+    r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.mlp\.fc(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.mlp.fc\5.\6",
+    r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.norm(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.norm\5.\6",
+    r"depth_head\.projects\.(\d+)\.(weight|bias)": r"neck.reassemble_stage.layers.\1.projection.\2",
+    r"depth_head\.resize_layers\.(?!2)(\d+)\.(weight|bias)": r"neck.reassemble_stage.layers.\1.resize.\2",  # (?!2): an index starting with "2" is not matched by this rule
+    r"depth_head\.scratch\.layer(\d+)_rn\.weight": lambda m: f"neck.convs.{int(m[1])-1}.weight",  # layerN_rn -> neck.convs.(N-1)
+    r"depth_head\.scratch\.output_conv(\d+)(?:\.(\d+))?\.(weight|bias)": lambda m: (  # output_conv2[.K] -> head.conv(2 + K//2), K defaulting to 0; any other output_convN -> head.convN
+        f"head.conv{int(m[1]) + (int(m[2])//2 if m[2] else 0)}.{m[3]}" if m[1] == "2" else f"head.conv{m[1]}.{m[3]}"
+    ),
+    r"depth_head\.scratch\.refinenet(\d+)\.out_conv\.(weight|bias)": lambda m: f"neck.fusion_stage.layers.{3 - (int(m[1])-1)}.projection.{m[2]}",  # refinenetN -> fusion layer 4-N
+    r"depth_head\.scratch\.refinenet(\d+)\.resConfUnit(\d+)\.conv(\d+)\.(weight|bias)": lambda m: f"neck.fusion_stage.layers.{3 - (int(m[1])-1)}.residual_layer{m[2]}.convolution{m[3]}.{m[4]}",  # refinenetN -> fusion layer 4-N here too
+}
+
+
+def get_dpt_config(model_name):
+    """Build the `DepthAnythingConfig` for the model size named in `model_name`.
+
+    Args:
+        model_name: Name containing "small", "base" or "large"; the tags are tested in that order.
+
+    Returns:
+        A relative-depth `DepthAnythingConfig` built around the matching pretrained `Dinov2Config`.
+
+    Raises:
+        NotImplementedError: If `model_name` contains none of the size tags.
+    """
+    # (size tag, DINOv2 checkpoint, backbone out_indices, fusion_hidden_size, neck_hidden_sizes)
+    variants = (
+        ("small", "facebook/dinov2-small", [3, 6, 9, 12], 64, [48, 96, 192, 384]),
+        ("base", "facebook/dinov2-base", [3, 6, 9, 12], 128, [96, 192, 384, 768]),
+        ("large", "facebook/dinov2-large", [5, 12, 18, 24], 256, [256, 512, 1024, 1024]),
+    )
+    selected = next((entry for entry in variants if entry[0] in model_name), None)
+    if selected is None:
+        raise NotImplementedError(f"Model not supported: {model_name}")
+    _, backbone_checkpoint, stage_indices, fusion_width, neck_widths = selected
+
+    dinov2_config = Dinov2Config.from_pretrained(
+        backbone_checkpoint,
+        out_indices=stage_indices,
+        apply_layernorm=True,
+        reshape_hidden_states=False,
+    )
+
+    return DepthAnythingConfig(
+        reassemble_hidden_size=dinov2_config.hidden_size,
+        patch_size=dinov2_config.patch_size,
+        backbone_config=dinov2_config,
+        fusion_hidden_size=fusion_width,
+        neck_hidden_sizes=neck_widths,
+        depth_estimation_type="relative",
+        max_depth=None,
+    )
+
+
+def convert_key_pattern(key, mapping):
+    """Rewrite `key` using the first `mapping` pattern that fully matches it; return `None` if none does."""
+    for regex, target in mapping.items():
+        found = re.fullmatch(regex, key)
+        if found is None:
+            continue
+        return target(found) if callable(target) else re.sub(regex, target, key)
+    return None
+
+
+def convert_keys(state_dict, config):
+    """Rename original checkpoint keys to HF names, emptying `state_dict` and splitting fused qkv tensors.
+
+    Raises `ValueError` for a key no rule matches, instead of storing it under a `None` key.
+    """
+    new_state_dict = {}
+    qkv_pattern = r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.attn\.qkv\.(weight|bias)"
+    for old_key in list(state_dict.keys()):
+        match = re.match(qkv_pattern, old_key)
+        if match is None:
+            continue
+        layer, attr = match.group(4, 5)
+        hidden_size = config.backbone_config.hidden_size
+        value = state_dict.pop(old_key)
+        chunks = (value[:hidden_size], value[hidden_size : hidden_size * 2], value[-hidden_size:])
+        for proj, tensor in zip(["query", "key", "value"], chunks):
+            new_state_dict[f"backbone.encoder.layer.{layer}.attention.attention.{proj}.{attr}"] = tensor
+
+    for old_key in list(state_dict.keys()):
+        new_key = convert_key_pattern(old_key, ORIGINAL_TO_CONVERTED_KEY_MAPPING)
+        if new_key is None:
+            raise ValueError(f"No conversion rule matches checkpoint key: {old_key}")
+        new_state_dict[new_key] = state_dict.pop(old_key)
+    return new_state_dict
+
+
+def prepare_img():
+    """Fetch the COCO val2017 image 000000039769 over HTTP and open it as a PIL image."""
+    return Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+
+# Checkpoint file (relative to the source Hub repo) for each supported model name.
+name_to_checkpoint = {
+    f"distill-any-depth-{size_tag}": f"{size_tag}/model.safetensors"
+    for size_tag in ("small", "base", "large")
+}
+
+
+@torch.no_grad()
+def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits):
+    """Download, convert and optionally verify, save and push one Distill Any Depth checkpoint.
+
+    `model_name` must be a key of `name_to_checkpoint`; a `pytorch_dump_folder_path` of `None` skips saving.
+    Raises `ValueError` when `verify_logits` is set and the output does not match the stored reference.
+    """
+    config = get_dpt_config(model_name)
+
+    filepath = hf_hub_download(repo_id="xingyang1/Distill-Any-Depth", filename=name_to_checkpoint[model_name])
+    converted_state_dict = convert_keys(load_file(filepath), config)
+
+    model = DepthAnythingForDepthEstimation(config)
+    model.load_state_dict(converted_state_dict)
+    model.eval()
+
+    processor = DPTImageProcessor(
+        do_resize=True,
+        size={"height": 518, "width": 518},
+        ensure_multiple_of=14,
+        keep_aspect_ratio=True,
+        do_rescale=True,
+        do_normalize=True,
+        image_mean=[0.485, 0.456, 0.406],
+        image_std=[0.229, 0.224, 0.225],
+    )
+
+    # Fetch the test image through `prepare_img()`; gradients are already off via the decorator.
+    pixel_values = processor(prepare_img(), return_tensors="pt").pixel_values
+    predicted_depth = model(pixel_values).predicted_depth
+
+    print("Shape of predicted depth:", predicted_depth.shape)
+    print("First values:", predicted_depth[0, :3, :3])
+
+    if verify_logits:
+        print("Verifying logits...")
+        expected_shape = torch.Size([1, 518, 686])
+
+        if model_name == "distill-any-depth-small":
+            expected_slice = torch.tensor(
+                [[2.5653, 2.5249, 2.5570], [2.4897, 2.5235, 2.5355], [2.5255, 2.5261, 2.5422]]
+            )
+        elif model_name == "distill-any-depth-base":
+            expected_slice = torch.tensor(
+                [[4.8976, 4.9075, 4.9403], [4.8872, 4.8906, 4.9448], [4.8712, 4.8898, 4.8838]]
+            )
+        elif model_name == "distill-any-depth-large":
+            expected_slice = torch.tensor(
+                [[55.1067, 51.1828, 51.6803], [51.9098, 50.7529, 51.4494], [50.1745, 50.5491, 50.8818]]
+            )
+        else:
+            raise ValueError("Not supported")
+
+        # Explicit raises rather than `assert`, which Python drops when run with `-O`.
+        if predicted_depth.shape != expected_shape:
+            raise ValueError(f"Predicted depth has shape {predicted_depth.shape}, expected {expected_shape}")
+        if not torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4):
+            raise ValueError("Predicted depth values do not match the expected slice")
+        print("Looks ok!")
+
+    if pytorch_dump_folder_path is not None:
+        Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)  # also create missing parents
+        print(f"Saving model and processor to {pytorch_dump_folder_path}")
+        model.save_pretrained(pytorch_dump_folder_path)
+        processor.save_pretrained(pytorch_dump_folder_path)
+
+    if push_to_hub:
+        print("Pushing model and processor to hub...")
+        model.push_to_hub(repo_id=f"{model_name.title()}-hf")
+        processor.push_to_hub(repo_id=f"{model_name.title()}-hf")
+
+
+if __name__ == "__main__":
+    # Script entry point: read the conversion options from the command line and run one conversion.
+    cli = argparse.ArgumentParser()
+    cli.add_argument(
+        "--model_name",
+        type=str,
+        default="distill-any-depth-small",
+        choices=name_to_checkpoint.keys(),
+        help="Name of the model you'd like to convert.",
+    )
+    cli.add_argument(
+        "--pytorch_dump_folder_path",
+        type=str,
+        default=None,
+        help="Path to the output PyTorch model directory.",
+    )
+    # `store_true` switches: both are False unless given on the command line.
+    cli.add_argument(
+        "--push_to_hub", action="store_true", help="Whether to push the model to the hub after conversion."
+    )
+    cli.add_argument("--verify_logits", action="store_true", help="Whether to verify the logits after conversion.")
+
+    options = cli.parse_args()
+    convert_dpt_checkpoint(
+        model_name=options.model_name,
+        pytorch_dump_folder_path=options.pytorch_dump_folder_path,
+        push_to_hub=options.push_to_hub,
+        verify_logits=options.verify_logits,
+    )