From 2422c38de6dc2341f9cf3685fc36d090a68d0f60 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:20:42 +0100 Subject: [PATCH] Add DINOv2 depth estimation (#26092) * First draft * Fix style * More improvements * Fix tests * Fix tests * Convert checkpoint * Improve DPTImageProcessor * Remove scripts, improve conversion script * Remove print statements * Fix test * Improve docstring * More improvements * Fix style * Fix image processor * Add tests * Address comments * Address comments * Make bias backwards compatible * Address comment * Address comment * Address comment * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Address comments * Add flag * Add tests * Make tests smaller * Use regular BackboneOutput * Fix all tests * Update test * Convert more checkpoints * Convert giant checkpoints, add integration test * Rename size_divisibility to size_divisor --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/dinov2/modeling_dinov2.py | 7 +- .../models/dpt/configuration_dpt.py | 58 ++- .../models/dpt/convert_dinov2_depth_to_hf.py | 384 ++++++++++++++++++ .../models/dpt/convert_dpt_to_pytorch.py | 15 +- .../models/dpt/image_processing_dpt.py | 70 +++- src/transformers/models/dpt/modeling_dpt.py | 153 ++++--- tests/models/dpt/test_image_processing_dpt.py | 22 + .../dpt/test_modeling_dpt_auto_backbone.py | 294 ++++++++++++++ 8 files changed, 928 insertions(+), 75 deletions(-) create mode 100644 src/transformers/models/dpt/convert_dinov2_depth_to_hf.py create mode 100644 tests/models/dpt/test_modeling_dpt_auto_backbone.py diff --git a/src/transformers/models/dinov2/modeling_dinov2.py b/src/transformers/models/dinov2/modeling_dinov2.py index 1215b23480..e6a17e5707 100644 --- a/src/transformers/models/dinov2/modeling_dinov2.py +++ b/src/transformers/models/dinov2/modeling_dinov2.py @@ -835,11 +835,12 @@ class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin): if self.config.apply_layernorm: hidden_state = self.layernorm(hidden_state) if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, 1:] + # this was actually a bug in the original implementation that we copied here, + # cause normally the order is height, width batch_size, _, height, width = pixel_values.shape patch_size = self.config.patch_size - hidden_state = hidden_state[:, 1:, :].reshape( - batch_size, width // patch_size, height // patch_size, -1 - ) + hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1) hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() feature_maps += (hidden_state,) diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index 45acd5902f..99994c5cb2 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -18,6 +18,7 @@ import copy from ...configuration_utils import PretrainedConfig from ...utils import logging +from ..auto.configuration_auto import CONFIG_MAPPING from ..bit import BitConfig @@ -91,6 +92,10 @@ class DPTConfig(PretrainedConfig): The index of the features to use in the heads. use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`): + Whether to use bias in the pre-activate residual units of the fusion blocks. + add_projection (`bool`, *optional*, defaults to `False`): + Whether to add a projection layer before the depth estimation head. use_auxiliary_head (`bool`, *optional*, defaults to `True`): Whether to use an auxiliary head during training. auxiliary_loss_weight (`float`, *optional*, defaults to 0.4): @@ -104,7 +109,8 @@ class DPTConfig(PretrainedConfig): neck_ignore_stages (`List[int]`, *optional*, defaults to `[0, 1]`): Used only for the `hybrid` embedding type. The stages of the readout layers to ignore. backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*): - Used only for the `hybrid` embedding type. The configuration of the backbone in a dictionary. + The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to + leverage the [`AutoBackbone`] API. Example: @@ -145,6 +151,8 @@ class DPTConfig(PretrainedConfig): fusion_hidden_size=256, head_in_index=-1, use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=None, + add_projection=False, use_auxiliary_head=True, auxiliary_loss_weight=0.4, semantic_loss_ignore_index=255, @@ -159,6 +167,7 @@ class DPTConfig(PretrainedConfig): self.hidden_size = hidden_size self.is_hybrid = is_hybrid + use_autobackbone = False if self.is_hybrid: if backbone_config is None: logger.info("Initializing the config with a `BiT` backbone.") @@ -185,32 +194,49 @@ class DPTConfig(PretrainedConfig): if readout_type != "project": raise ValueError("Readout type must be 'project' when using `DPT-hybrid` mode.") - else: - self.backbone_config = None + + elif backbone_config is not None: + use_autobackbone = True + + if isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + self.backbone_config = backbone_config self.backbone_featmap_shape = None self.neck_ignore_stages = [] - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.backbone_out_indices = backbone_out_indices + else: + self.backbone_config = backbone_config + self.backbone_featmap_shape = None + self.neck_ignore_stages = [] + + self.num_hidden_layers = None if use_autobackbone else num_hidden_layers + self.num_attention_heads = None if use_autobackbone else num_attention_heads + self.intermediate_size = None if use_autobackbone else intermediate_size + self.hidden_dropout_prob = None if use_autobackbone else hidden_dropout_prob + self.attention_probs_dropout_prob = None if use_autobackbone else attention_probs_dropout_prob + self.layer_norm_eps = None if use_autobackbone else layer_norm_eps + self.image_size = None if use_autobackbone else image_size + self.patch_size = None if use_autobackbone else patch_size + self.num_channels = None if use_autobackbone else num_channels + self.qkv_bias = None if use_autobackbone else qkv_bias + self.backbone_out_indices = None if use_autobackbone else backbone_out_indices + if readout_type not in ["ignore", "add", "project"]: raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']") + self.hidden_act = hidden_act + self.initializer_range = initializer_range self.readout_type = readout_type self.reassemble_factors = reassemble_factors self.neck_hidden_sizes = neck_hidden_sizes self.fusion_hidden_size = fusion_hidden_size self.head_in_index = head_in_index self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual + self.use_bias_in_fusion_residual = use_bias_in_fusion_residual + self.add_projection = add_projection + # auxiliary head attributes (semantic segmentation) self.use_auxiliary_head = use_auxiliary_head self.auxiliary_loss_weight = auxiliary_loss_weight diff --git a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py new file mode 100644 index 0000000000..2bd147096c --- /dev/null +++ b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py @@ -0,0 +1,384 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DINOv2 + DPT checkpoints from the original repository. URL: +https://github.com/facebookresearch/dinov2/tree/main""" + + +import argparse +import itertools +import math +from pathlib import Path + +import requests +import torch +from PIL import Image +from torchvision import transforms + +from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation, DPTImageProcessor +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_dpt_config(model_name): + if "small" in model_name: + # equivalent to stage 3, stage 6, stage 9, stage 12 + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-small", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False + ) + neck_hidden_sizes = [48, 96, 192, 384] + elif "base" in model_name: + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-base", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False + ) + neck_hidden_sizes = [96, 192, 384, 768] + elif "large" in model_name: + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-large", out_indices=[5, 12, 18, 24], apply_layernorm=False, reshape_hidden_states=False + ) + neck_hidden_sizes = [128, 256, 512, 1024] + elif "giant" in model_name: + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-giant", out_indices=[10, 20, 30, 40], apply_layernorm=False, reshape_hidden_states=False + ) + neck_hidden_sizes = [192, 384, 768, 1536] + else: + raise NotImplementedError("To do") + + config = DPTConfig( + backbone_config=backbone_config, + neck_hidden_sizes=neck_hidden_sizes, + use_bias_in_fusion_residual=False, + add_projection=True, + ) + + return config + + +# here we list all DPT keys to be renamed (original name on the left, our name on the right) +def create_rename_keys_dpt(config): + rename_keys = [] + + # fmt: off + # activation postprocessing (projections, readout projections + resize blocks) + for i in range(4): + rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) + rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) + + rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight")) + rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias")) + + if i != 2: + rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) + rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) + + # fusion layers + for i in range(4): + rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.weight", f"neck.fusion_stage.layers.{i}.projection.weight")) + rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.bias", f"neck.fusion_stage.layers.{i}.projection.bias")) + if i != 0: + rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution1.weight")) + rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution2.weight")) + rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution1.weight")) + rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution2.weight")) + + # neck convolutions + for i in range(4): + rename_keys.append((f"decode_head.convs.{i}.conv.weight", f"neck.convs.{i}.weight")) + + # head + rename_keys.append(("decode_head.project.conv.weight", "head.projection.weight")) + rename_keys.append(("decode_head.project.conv.bias", "head.projection.bias")) + + for i in range(0, 5, 2): + rename_keys.append((f"decode_head.conv_depth.head.{i}.weight", f"head.head.{i}.weight")) + rename_keys.append((f"decode_head.conv_depth.head.{i}.bias", f"head.head.{i}.bias")) + # fmt: on + + return rename_keys + + +# here we list all backbone keys to be renamed (original name on the left, our name on the right) +def create_rename_keys_backbone(config): + rename_keys = [] + + # fmt: off + # patch embedding layer + rename_keys.append(("cls_token", "backbone.embeddings.cls_token")) + rename_keys.append(("mask_token", "backbone.embeddings.mask_token")) + rename_keys.append(("pos_embed", "backbone.embeddings.position_embeddings")) + rename_keys.append(("patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) + rename_keys.append(("patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) + + # Transfomer encoder + for i in range(config.backbone_config.num_hidden_layers): + # layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias")) + # MLP + if config.backbone_config.use_swiglu_ffn: + rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"backbone.encoder.layer.{i}.mlp.w12.weight")) + rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"backbone.encoder.layer.{i}.mlp.w12.bias")) + rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"backbone.encoder.layer.{i}.mlp.w3.weight")) + rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"backbone.encoder.layer.{i}.mlp.w3.bias")) + else: + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias")) + # layerscale + rename_keys.append((f"blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1")) + rename_keys.append((f"blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1")) + # attention projection layer + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) + # fmt: on + + rename_keys.append(("norm.weight", "backbone.layernorm.weight")) + rename_keys.append(("norm.bias", "backbone.layernorm.bias")) + + return rename_keys + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + for i in range(config.backbone_config.num_hidden_layers): + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") + hidden_size = config.backbone_config.hidden_size + # next, add query, keys and values (in that order) to the state dict + state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + hidden_size : hidden_size * 2, : + ] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + hidden_size : hidden_size * 2 + ] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:] + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "https://dl.fbaipublicfiles.com/dinov2/images/example.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +name_to_url = { + "dpt-dinov2-small-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_nyu_dpt_head.pth", + "dpt-dinov2-small-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_kitti_dpt_head.pth", + "dpt-dinov2-base-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_nyu_dpt_head.pth", + "dpt-dinov2-base-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_kitti_dpt_head.pth", + "dpt-dinov2-large-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_nyu_dpt_head.pth", + "dpt-dinov2-large-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_kitti_dpt_head.pth", + "dpt-dinov2-giant-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_nyu_dpt_head.pth", + "dpt-dinov2-giant-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_kitti_dpt_head.pth", +} + + +def get_original_pixel_values(image): + class CenterPadding(object): + def __init__(self, multiple): + super().__init__() + self.multiple = multiple + + def _get_pad(self, size): + new_size = math.ceil(size / self.multiple) * self.multiple + pad_size = new_size - size + pad_size_left = pad_size // 2 + pad_size_right = pad_size - pad_size_left + return pad_size_left, pad_size_right + + def __call__(self, img): + pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in img.shape[-2:][::-1])) + output = torch.nn.functional.pad(img, pads) + return output + + def __repr__(self): + return self.__class__.__name__ + "()" + + def make_depth_transform() -> transforms.Compose: + return transforms.Compose( + [ + transforms.ToTensor(), + lambda x: 255.0 * x[:3], # Discard alpha component and scale by 255 + transforms.Normalize( + mean=(123.675, 116.28, 103.53), + std=(58.395, 57.12, 57.375), + ), + CenterPadding(multiple=14), + ] + ) + + transform = make_depth_transform() + original_pixel_values = transform(image).unsqueeze(0) + + return original_pixel_values + + +@torch.no_grad() +def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): + """ + Copy/paste/tweak model's weights to our DPT structure. + """ + + # define DPT configuration based on URL + checkpoint_url = name_to_url[model_name] + config = get_dpt_config(model_name) + + # load original DPT state_dict from URL + print("URL:", checkpoint_url) + dpt_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"] + # rename keys + rename_keys = create_rename_keys_dpt(config) + for src, dest in rename_keys: + rename_key(dpt_state_dict, src, dest) + + # load original backbone state_dict from URL + if "small" in model_name: + original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14") + elif "base" in model_name: + original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14") + elif "large" in model_name: + original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitl14") + elif "giant" in model_name: + original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitg14") + else: + raise NotImplementedError("To do") + original_model.eval() + backbone_state_dict = original_model.state_dict() + + # rename keys + rename_keys = create_rename_keys_backbone(config) + for src, dest in rename_keys: + rename_key(backbone_state_dict, src, dest) + + # read in qkv matrices + read_in_q_k_v(backbone_state_dict, config) + + for key, val in backbone_state_dict.copy().items(): + val = backbone_state_dict.pop(key) + if "w12" in key: + key = key.replace("w12", "weights_in") + if "w3" in key: + key = key.replace("w3", "weights_out") + backbone_state_dict[key] = val + + # merge state_dicts + state_dict = {**backbone_state_dict, **dpt_state_dict} + + # load HuggingFace model + model = DPTForDepthEstimation(config) + missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) + print("Missing keys:", missing_keys) + print("Unexpected keys:", unexpected_keys) + assert missing_keys == [ + "neck.fusion_stage.layers.0.residual_layer1.convolution1.weight", + "neck.fusion_stage.layers.0.residual_layer1.convolution2.weight", + ] + model.eval() + + # Verify image processor + processor = DPTImageProcessor( + do_resize=False, + do_rescale=False, + do_pad=True, + size_divisor=14, + do_normalize=True, + image_mean=(123.675, 116.28, 103.53), + image_std=(58.395, 57.12, 57.375), + ) + + image = prepare_img() + pixel_values = processor(image, return_tensors="pt").pixel_values.float() + original_pixel_values = get_original_pixel_values(image) + + assert torch.allclose(pixel_values, original_pixel_values) + + # Verify forward pass + with torch.no_grad(): + outputs = model(pixel_values) + + predicted_depth = outputs.predicted_depth + + print("Shape of predicted depth:", predicted_depth.shape) + print("First values of predicted depth:", predicted_depth[0, :3, :3]) + + # assert logits + if verify_logits: + if model_name == "dpt-dinov2-small-nyu": + expected_shape = torch.Size([1, 576, 736]) + expected_slice = torch.tensor( + [[3.3576, 3.4741, 3.4345], [3.4324, 3.5012, 3.2775], [3.2560, 3.3563, 3.2354]] + ) + + assert predicted_depth.shape == torch.Size(expected_shape) + assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-5) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model and processor to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + print("Pushing model and processor to hub...") + model.push_to_hub(repo_id=f"facebook/{model_name}") + processor.push_to_hub(repo_id=f"facebook/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="dpt-dinov2-small-nyu", + type=str, + choices=name_to_url.keys(), + help="Name of the model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether to push the model to the hub after conversion.", + ) + parser.add_argument( + "--verify_logits", + action="store_true", + required=False, + help="Path to the output PyTorch model directory.", + ) + + args = parser.parse_args() + convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/src/transformers/models/dpt/convert_dpt_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_to_pytorch.py index cee5be88c3..42637cb215 100644 --- a/src/transformers/models/dpt/convert_dpt_to_pytorch.py +++ b/src/transformers/models/dpt/convert_dpt_to_pytorch.py @@ -229,12 +229,14 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub if "ade" in checkpoint_url else torch.allclose(outputs[0, :3, :3], expected_slice) ) + print("Looks ok!") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: print("Pushing model to hub...") @@ -265,7 +267,7 @@ if __name__ == "__main__": "--pytorch_dump_folder_path", default=None, type=str, - required=True, + required=False, help="Path to the output PyTorch model directory.", ) parser.add_argument( @@ -276,6 +278,7 @@ if __name__ == "__main__": "--model_name", default="dpt-large", type=str, + required=False, help="Name of the model, in case you're pushing to the hub.", ) diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 93374dbd92..49403140b9 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -20,7 +20,7 @@ from typing import Dict, Iterable, List, Optional, Tuple, Union import numpy as np from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import resize, to_channel_dimension_format +from ...image_transforms import pad, resize, to_channel_dimension_format from ...image_utils import ( IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, @@ -122,6 +122,12 @@ class DPTImageProcessor(BaseImageProcessor): image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): Standard deviation to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `False`): + Whether to apply center padding. This was introduced in the DINOv2 paper, which uses the model in + combination with DPT. + size_divisor (`int`, *optional*): + If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the + DINOv2 paper, which uses the model in combination with DPT. """ model_input_names = ["pixel_values"] @@ -138,6 +144,8 @@ class DPTImageProcessor(BaseImageProcessor): do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, + do_pad: bool = False, + size_divisor: int = None, **kwargs, ) -> None: super().__init__(**kwargs) @@ -153,6 +161,8 @@ class DPTImageProcessor(BaseImageProcessor): self.do_normalize = do_normalize self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.do_pad = do_pad + self.size_divisor = size_divisor def resize( self, @@ -208,6 +218,51 @@ class DPTImageProcessor(BaseImageProcessor): **kwargs, ) + def pad_image( + self, + image: np.array, + size_divisor: int, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Center pad an image to be a multiple of `multiple`. + + Args: + image (`np.ndarray`): + Image to pad. + size_divisor (`int`): + The width and height of the image will be padded to a multiple of this number. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + + def _get_pad(size, size_divisor): + new_size = math.ceil(size / size_divisor) * size_divisor + pad_size = new_size - size + pad_size_left = pad_size // 2 + pad_size_right = pad_size - pad_size_left + return pad_size_left, pad_size_right + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + height, width = get_image_size(image, input_data_format) + + pad_size_left, pad_size_right = _get_pad(height, size_divisor) + pad_size_top, pad_size_bottom = _get_pad(width, size_divisor) + + return pad(image, ((pad_size_left, pad_size_right), (pad_size_top, pad_size_bottom)), data_format=data_format) + def preprocess( self, images: ImageInput, @@ -221,6 +276,8 @@ class DPTImageProcessor(BaseImageProcessor): do_normalize: bool = None, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, + do_pad: bool = None, + size_divisor: int = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: ChannelDimension = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, @@ -286,6 +343,8 @@ class DPTImageProcessor(BaseImageProcessor): do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std + do_pad = do_pad if do_pad is not None else self.do_pad + size_divisor = size_divisor if size_divisor is not None else self.size_divisor images = make_list_of_images(images) @@ -304,6 +363,9 @@ class DPTImageProcessor(BaseImageProcessor): if do_normalize and (image_mean is None or image_std is None): raise ValueError("Image mean and std must be specified if do_normalize is True.") + if do_pad and size_divisor is None: + raise ValueError("Size divisibility must be specified if do_pad is True.") + # All transformations expect numpy arrays. images = [to_numpy_array(image) for image in images] @@ -335,6 +397,12 @@ class DPTImageProcessor(BaseImageProcessor): for image in images ] + if do_pad: + images = [ + self.pad_image(image=image, size_divisor=size_divisor, input_data_format=input_data_format) + for image in images + ] + images = [ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 63796d0168..ca44b6a42a 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -599,12 +599,13 @@ class DPTReassembleStage(nn.Module): # When using DPT-Hybrid the readout type is set to "project". The sanity check is done on the config file self.readout_projects = nn.ModuleList() + hidden_size = _get_backbone_hidden_size(config) for i in range(len(config.neck_hidden_sizes)): if i <= 1: self.readout_projects.append(nn.Sequential(nn.Identity())) elif i > 1: self.readout_projects.append( - nn.Sequential(nn.Linear(2 * config.hidden_size, config.hidden_size), ACT2FN[config.hidden_act]) + nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act]) ) def _init_reassemble_dpt(self, config): @@ -613,12 +614,13 @@ class DPTReassembleStage(nn.Module): if config.readout_type == "project": self.readout_projects = nn.ModuleList() + hidden_size = _get_backbone_hidden_size(config) for _ in range(len(config.neck_hidden_sizes)): self.readout_projects.append( - nn.Sequential(nn.Linear(2 * config.hidden_size, config.hidden_size), ACT2FN[config.hidden_act]) + nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act]) ) - def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: + def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]: """ Args: hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`): @@ -628,21 +630,24 @@ class DPTReassembleStage(nn.Module): for i, hidden_state in enumerate(hidden_states): if i not in self.neck_ignore_stages: - # reshape to (B, C, H, W) - hidden_state, cls_token = hidden_state[:, 1:], hidden_state[:, 0] + # reshape to (batch_size, num_channels, height, width) + cls_token, hidden_state = hidden_state[:, 0], hidden_state[:, 1:] batch_size, sequence_length, num_channels = hidden_state.shape - size = int(math.sqrt(sequence_length)) - hidden_state = hidden_state.reshape(batch_size, size, size, num_channels) + if patch_height is not None and patch_width is not None: + hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels) + else: + size = int(math.sqrt(sequence_length)) + hidden_state = hidden_state.reshape(batch_size, size, size, num_channels) hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() feature_shape = hidden_state.shape if self.config.readout_type == "project": - # reshape to (B, H*W, C) + # reshape to (batch_size, height*width, num_channels) hidden_state = hidden_state.flatten(2).permute((0, 2, 1)) readout = cls_token.unsqueeze(1).expand_as(hidden_state) # concatenate the readout token to the hidden states and project hidden_state = self.readout_projects[i](torch.cat((hidden_state, readout), -1)) - # reshape back to (B, C, H, W) + # reshape back to (batch_size, num_channels, height, width) hidden_state = hidden_state.permute(0, 2, 1).reshape(feature_shape) elif self.config.readout_type == "add": hidden_state = hidden_state.flatten(2) + cls_token.unsqueeze(-1) @@ -653,11 +658,19 @@ class DPTReassembleStage(nn.Module): return out +def _get_backbone_hidden_size(config): + if config.backbone_config is not None and config.is_hybrid is False: + return config.backbone_config.hidden_size + else: + return config.hidden_size + + class DPTReassembleLayer(nn.Module): def __init__(self, config, channels, factor): super().__init__() # projection - self.projection = nn.Conv2d(in_channels=config.hidden_size, out_channels=channels, kernel_size=1) + hidden_size = _get_backbone_hidden_size(config) + self.projection = nn.Conv2d(in_channels=hidden_size, out_channels=channels, kernel_size=1) # up/down sampling depending on factor if factor > 1: @@ -710,24 +723,30 @@ class DPTPreActResidualLayer(nn.Module): super().__init__() self.use_batch_norm = config.use_batch_norm_in_fusion_residual - self.activation1 = ACT2FN["relu"] + use_bias_in_fusion_residual = ( + config.use_bias_in_fusion_residual + if config.use_bias_in_fusion_residual is not None + else not self.use_batch_norm + ) + + self.activation1 = nn.ReLU() self.convolution1 = nn.Conv2d( config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, - bias=not self.use_batch_norm, + bias=use_bias_in_fusion_residual, ) - self.activation2 = ACT2FN["relu"] + self.activation2 = nn.ReLU() self.convolution2 = nn.Conv2d( config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, - bias=not self.use_batch_norm, + bias=use_bias_in_fusion_residual, ) if self.use_batch_norm: @@ -973,8 +992,12 @@ class DPTNeck(nn.Module): super().__init__() self.config = config - # postprocessing - self.reassemble_stage = DPTReassembleStage(config) + # postprocessing: only required in case of a non-hierarchical backbone (e.g. ViT, BEiT) + if config.backbone_config is not None and config.backbone_config.model_type in ["swinv2"]: + self.reassemble_stage = None + else: + self.reassemble_stage = DPTReassembleStage(config) + self.convs = nn.ModuleList() for channel in config.neck_hidden_sizes: self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False)) @@ -982,17 +1005,23 @@ class DPTNeck(nn.Module): # fusion self.fusion_stage = DPTFeatureFusionStage(config) - def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: - if not isinstance(hidden_states, list): - raise ValueError("hidden_states should be a list of tensors") + def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]: + """ + Args: + hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`): + List of hidden states from the backbone. + """ + if not isinstance(hidden_states, (tuple, list)): + raise ValueError("hidden_states should be a tuple or list of tensors") if len(hidden_states) != len(self.config.neck_hidden_sizes): raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") # postprocess hidden states - features = self.reassemble_stage(hidden_states) + if self.reassemble_stage is not None: + hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width) - features = [self.convs[i](feature) for i, feature in enumerate(features)] + features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)] # fusion blocks output = self.fusion_stage(features) @@ -1012,20 +1041,28 @@ class DPTDepthEstimationHead(nn.Module): self.config = config + self.projection = None + if config.add_projection: + self.projection = nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + features = config.fusion_hidden_size self.head = nn.Sequential( nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True), nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), - ACT2FN["relu"], + nn.ReLU(), nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), - ACT2FN["relu"], + nn.ReLU(), ) def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: # use last features hidden_states = hidden_states[self.config.head_in_index] + if self.projection is not None: + hidden_states = self.projection(hidden_states) + hidden_states = nn.ReLU()(hidden_states) + predicted_depth = self.head(hidden_states) predicted_depth = predicted_depth.squeeze(dim=1) @@ -1043,7 +1080,11 @@ class DPTForDepthEstimation(DPTPreTrainedModel): def __init__(self, config): super().__init__(config) - self.dpt = DPTModel(config, add_pooling_layer=False) + self.backbone = None + if config.backbone_config is not None and config.is_hybrid is False: + self.backbone = AutoBackbone.from_config(config.backbone_config) + else: + self.dpt = DPTModel(config, add_pooling_layer=False) # Neck self.neck = DPTNeck(config) @@ -1109,32 +1150,46 @@ class DPTForDepthEstimation(DPTPreTrainedModel): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - outputs = self.dpt( - pixel_values, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=True, # we need the intermediate hidden states - return_dict=return_dict, - ) - - hidden_states = outputs.hidden_states if return_dict else outputs[1] - - # only keep certain features based on config.backbone_out_indices - # note that the hidden_states also include the initial embeddings - if not self.config.is_hybrid: - hidden_states = [ - feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices - ] - else: - backbone_hidden_states = outputs.intermediate_activations if return_dict else list(outputs[-1]) - backbone_hidden_states.extend( - feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices[2:] + if self.backbone is not None: + outputs = self.backbone.forward_with_filtered_kwargs( + pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions ) + hidden_states = outputs.feature_maps + else: + outputs = self.dpt( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + hidden_states = outputs.hidden_states if return_dict else outputs[1] + # only keep certain features based on config.backbone_out_indices + # note that the hidden_states also include the initial embeddings + if not self.config.is_hybrid: + hidden_states = [ + feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices + ] + else: + backbone_hidden_states = outputs.intermediate_activations if return_dict else list(outputs[-1]) + backbone_hidden_states.extend( + feature + for idx, feature in enumerate(hidden_states[1:]) + if idx in self.config.backbone_out_indices[2:] + ) - hidden_states = backbone_hidden_states + hidden_states = backbone_hidden_states - hidden_states = self.neck(hidden_states) + patch_height, patch_width = None, None + if self.config.backbone_config is not None and self.config.is_hybrid is False: + _, _, height, width = pixel_values.shape + patch_size = self.config.backbone_config.patch_size + patch_height = height // patch_size + patch_width = width // patch_size + + hidden_states = self.neck(hidden_states, patch_height, patch_width) predicted_depth = self.head(hidden_states) @@ -1167,7 +1222,7 @@ class DPTSemanticSegmentationHead(nn.Module): self.head = nn.Sequential( nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False), nn.BatchNorm2d(features), - ACT2FN["relu"], + nn.ReLU(), nn.Dropout(config.semantic_classifier_dropout), nn.Conv2d(features, config.num_labels, kernel_size=1), nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True), @@ -1190,7 +1245,7 @@ class DPTAuxiliaryHead(nn.Module): self.head = nn.Sequential( nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False), nn.BatchNorm2d(features), - ACT2FN["relu"], + nn.ReLU(), nn.Dropout(0.1, False), nn.Conv2d(features, config.num_labels, kernel_size=1), ) @@ -1287,7 +1342,7 @@ class DPTForSemanticSegmentation(DPTPreTrainedModel): hidden_states = backbone_hidden_states - hidden_states = self.neck(hidden_states) + hidden_states = self.neck(hidden_states=hidden_states) logits = self.head(hidden_states) diff --git a/tests/models/dpt/test_image_processing_dpt.py b/tests/models/dpt/test_image_processing_dpt.py index 5670d50b91..a70165048b 100644 --- a/tests/models/dpt/test_image_processing_dpt.py +++ b/tests/models/dpt/test_image_processing_dpt.py @@ -16,6 +16,8 @@ import unittest +import numpy as np + from transformers.file_utils import is_vision_available from transformers.testing_utils import require_torch, require_vision @@ -97,6 +99,10 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): self.assertTrue(hasattr(image_processing, "do_normalize")) self.assertTrue(hasattr(image_processing, "do_resize")) self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "size_divisor")) def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict) @@ -104,3 +110,19 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + def test_padding(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + image = np.random.randn(3, 249, 491) + + # test individual method + image = image_processing.pad_image(image, size_divisor=4) + self.assertTrue(image.shape[1] % 4 == 0) + self.assertTrue(image.shape[2] % 4 == 0) + + # test by calling + pixel_values = image_processing.preprocess( + image, do_rescale=False, do_resize=False, do_pad=True, size_divisor=4, return_tensors="pt" + ).pixel_values + self.assertTrue(pixel_values.shape[2] % 4 == 0) + self.assertTrue(pixel_values.shape[3] % 4 == 0) diff --git a/tests/models/dpt/test_modeling_dpt_auto_backbone.py b/tests/models/dpt/test_modeling_dpt_auto_backbone.py new file mode 100644 index 0000000000..95e3128ff0 --- /dev/null +++ b/tests/models/dpt/test_modeling_dpt_auto_backbone.py @@ -0,0 +1,294 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch DPT model. """ + + +import inspect +import unittest + +from transformers import Dinov2Config, DPTConfig +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import MODEL_MAPPING, DPTForDepthEstimation + from transformers.models.dpt.modeling_dpt import DPT_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import DPTImageProcessor + + +class DPTModelTester: + def __init__( + self, + parent, + batch_size=2, + num_channels=3, + image_size=32, + patch_size=16, + use_labels=True, + num_labels=3, + is_training=True, + hidden_size=4, + num_hidden_layers=2, + num_attention_heads=2, + intermediate_size=8, + out_features=["stage1", "stage2"], + apply_layernorm=False, + reshape_hidden_states=False, + neck_hidden_sizes=[2, 2], + fusion_hidden_size=6, + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.out_features = out_features + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + self.use_labels = use_labels + self.num_labels = num_labels + self.is_training = is_training + self.neck_hidden_sizes = neck_hidden_sizes + self.fusion_hidden_size = fusion_hidden_size + # DPT's sequence length + self.seq_length = (self.image_size // self.patch_size) ** 2 + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DPTConfig( + backbone_config=self.get_backbone_config(), + neck_hidden_sizes=self.neck_hidden_sizes, + fusion_hidden_size=self.fusion_hidden_size, + ) + + def get_backbone_config(self): + return Dinov2Config( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + is_training=self.is_training, + out_features=self.out_features, + reshape_hidden_states=self.reshape_hidden_states, + ) + + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DPTForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as DPT does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (DPTForDepthEstimation,) if is_torch_available() else () + pipeline_model_mapping = {"depth-estimation": DPTForDepthEstimation} if is_torch_available() else {} + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = DPTModelTester(self) + self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="DPT with AutoBackbone does not have a base model and hence no input_embeddings") + def test_inputs_embeds(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + def test_training(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DPTForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class in get_values(MODEL_MAPPING): + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DPTForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + if model_class in get_values(MODEL_MAPPING) or not model_class.supports_gradient_checkpointing: + continue + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + # Skip the check for the backbone + backbone_params = [] + for name, module in model.named_modules(): + if module.__class__.__name__ == "DPTViTHybridEmbeddings": + backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()] + break + + for name, param in model.named_parameters(): + if param.requires_grad: + if name in backbone_params: + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + @unittest.skip(reason="DPT with AutoBackbone does not have a base model and hence no input_embeddings") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="DPT with AutoBackbone does not have a base model") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="DPT with AutoBackbone does not have a base model") + def test_save_load_fast_init_to_base(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_name in DPT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DPTForDepthEstimation.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class DPTModelIntegrationTest(unittest.TestCase): + def test_inference_depth_estimation(self): + image_processor = DPTImageProcessor.from_pretrained("facebook/dpt-dinov2-small-kitti") + model = DPTForDepthEstimation.from_pretrained("facebook/dpt-dinov2-small-kitti").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 576, 736)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[6.0433, 7.1636, 7.4268], [6.9047, 7.2471, 7.2355], [7.9261, 8.0631, 8.0244]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4))