From 979b039c893f771bc77c4089d841af89927759a1 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Mon, 28 Mar 2022 16:28:10 +0200 Subject: [PATCH] Add DPT (#15991) * First draft * More improvements * Add fusion blocks * Make conversion script work for dpt_large * Make conversion script work * Improve implementation * Improve conversion script * Add DPTForSemanticSegmentation * Make conversion work for semantic segmentation * Add tests * Remove print statements * First draft * Redesign neck * Improve tests * Improve implementation some more * Make neck output list of tensors * Improve neck and feature extractor * Fix integration tests * Make more tests pass * Make all tests pass * Add missing config archive map * Add in_index attribute to make heads accept list of tensors * Apply suggestions from code review * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Apply some more suggestions * Add copied from statements * Remove assert * Apply suggestions from code review * Apply suggestions from code review * Remove DPTInterpolate in favor of nn.Upsample * Add comments * Apply suggestions from code review * Apply suggestions from code review * Add proposed design * Update design * Add DPTReassembleLayer * Add DPTFeatureFusionStage * Apply more suggestions from code review * Apply suggestions from code review * Apply suggestions from code review * Fix rebase * Update in_index and out_indices * Fix conversion script * Fix code quality * Add model to toctree and use DepthEstimatorOutput * Fix rebase * Fix code examples * Improve code * Fix copied from statements * Apply suggestions from code review * Remove compute_loss method * Apply suggestions from code review * Fix documentation tests file * Remove test.py file * Improve doc example Co-authored-by: Niels Rogge Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Niels Rogge --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/_toctree.yml | 2 + docs/source/index.mdx | 2 + docs/source/model_doc/dpt.mdx | 57 + src/transformers/__init__.py | 21 +- src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/auto/modeling_auto.py | 2 + src/transformers/models/dpt/__init__.py | 59 + .../models/dpt/configuration_dpt.py | 170 +++ .../models/dpt/convert_dpt_to_pytorch.py | 283 +++++ .../models/dpt/feature_extraction_dpt.py | 202 +++ src/transformers/models/dpt/modeling_dpt.py | 1132 +++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 31 + .../utils/dummy_vision_objects.py | 7 + tests/dpt/__init__.py | 0 tests/dpt/test_feature_extraction_dpt.py | 188 +++ tests/dpt/test_modeling_dpt.py | 393 ++++++ tests/test_modeling_common.py | 6 + utils/check_repo.py | 1 + utils/documentation_tests.txt | 3 +- 24 files changed, 2565 insertions(+), 2 deletions(-) create mode 100644 docs/source/model_doc/dpt.mdx create mode 100644 src/transformers/models/dpt/__init__.py create mode 100644 src/transformers/models/dpt/configuration_dpt.py create mode 100644 src/transformers/models/dpt/convert_dpt_to_pytorch.py create mode 100644 src/transformers/models/dpt/feature_extraction_dpt.py create mode 100755 src/transformers/models/dpt/modeling_dpt.py create mode 100644 tests/dpt/__init__.py create mode 100644 tests/dpt/test_feature_extraction_dpt.py create mode 100644 tests/dpt/test_modeling_dpt.py diff --git a/README.md b/README.md index b3c1f3a7c6..024e452201 100644 --- a/README.md +++ b/README.md @@ -261,6 +261,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. diff --git a/README_ko.md b/README_ko.md index 2ed9ea5568..5d813b0cc7 100644 --- a/README_ko.md +++ b/README_ko.md @@ -240,6 +240,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. diff --git a/README_zh-hans.md b/README_zh-hans.md index 0e649be5de..5570335d49 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -264,6 +264,7 @@ conda install -c huggingface transformers 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) 和德语版 DistilBERT。 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (来自 Microsoft Research) 伴随论文 [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) 由 Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei 发布。 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。 +1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (来自 Intel Labs) 伴随论文 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 由 René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 发布。 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 56475a49a4..88ba012d5a 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -276,6 +276,7 @@ conda install -c huggingface transformers 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 54b60ea7c4..0be8d4b9ec 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -206,6 +206,8 @@ title: DiT - local: model_doc/dpr title: DPR + - local: model_doc/dpt + title: DPT - local: model_doc/electra title: ELECTRA - local: model_doc/encoder-decoder diff --git a/docs/source/index.mdx b/docs/source/index.mdx index ee3ae17b2b..af1608dd4c 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -82,6 +82,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret 1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. @@ -194,6 +195,7 @@ Flax), PyTorch, and/or TensorFlow. | DETR | ❌ | ❌ | ✅ | ❌ | ❌ | | DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | | DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | | ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | | Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | | FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/model_doc/dpt.mdx b/docs/source/model_doc/dpt.mdx new file mode 100644 index 0000000000..cdf009c6c8 --- /dev/null +++ b/docs/source/model_doc/dpt.mdx @@ -0,0 +1,57 @@ + + +# DPT + +## Overview + +The DPT model was proposed in [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +DPT is a model that leverages the [Vision Transformer (ViT)](vit) as backbone for dense prediction tasks like semantic segmentation and depth estimation. + +The abstract from the paper is the following: + +*We introduce dense vision transformers, an architecture that leverages vision transformers in place of convolutional networks as a backbone for dense prediction tasks. We assemble tokens from various stages of the vision transformer into image-like representations at various resolutions and progressively combine them into full-resolution predictions using a convolutional decoder. The transformer backbone processes representations at a constant and relatively high resolution and has a global receptive field at every stage. These properties allow the dense vision transformer to provide finer-grained and more globally coherent predictions when compared to fully-convolutional networks. Our experiments show that this architecture yields substantial improvements on dense prediction tasks, especially when a large amount of training data is available. For monocular depth estimation, we observe an improvement of up to 28% in relative performance when compared to a state-of-the-art fully-convolutional network. When applied to semantic segmentation, dense vision transformers set a new state of the art on ADE20K with 49.02% mIoU. We further show that the architecture can be fine-tuned on smaller datasets such as NYUv2, KITTI, and Pascal Context where it also sets the new state of the art.* + + + + DPT architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/isl-org/DPT). + +## DPTConfig + +[[autodoc]] DPTConfig + + +## DPTFeatureExtractor + +[[autodoc]] DPTFeatureExtractor + - __call__ + + +## DPTModel + +[[autodoc]] DPTModel + - forward + + +## DPTForDepthEstimation + +[[autodoc]] DPTForDepthEstimation + - forward + + +## DPTForSemanticSegmentation + +[[autodoc]] DPTForSemanticSegmentation + - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ad61a63062..0760618c69 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -187,6 +187,7 @@ _import_structure = { "DPRReaderOutput", "DPRReaderTokenizer", ], + "models.dpt": ["DPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DPTConfig"], "models.electra": ["ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "ElectraConfig", "ElectraTokenizer"], "models.encoder_decoder": ["EncoderDecoderConfig"], "models.flaubert": ["FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlaubertConfig", "FlaubertTokenizer"], @@ -528,6 +529,7 @@ if is_vision_available(): _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") + _import_structure["models.dpt"].append("DPTFeatureExtractor") _import_structure["models.glpn"].append("GLPNFeatureExtractor") _import_structure["models.imagegpt"].append("ImageGPTFeatureExtractor") _import_structure["models.layoutlmv2"].append("LayoutLMv2FeatureExtractor") @@ -947,6 +949,15 @@ if is_torch_available(): "DPRReader", ] ) + _import_structure["models.dpt"].extend( + [ + "DPT_PRETRAINED_MODEL_ARCHIVE_LIST", + "DPTForDepthEstimation", + "DPTForSemanticSegmentation", + "DPTModel", + "DPTPreTrainedModel", + ] + ) _import_structure["models.electra"].extend( [ "ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2543,6 +2554,7 @@ if TYPE_CHECKING: DPRReaderOutput, DPRReaderTokenizer, ) + from .models.dpt import DPT_PRETRAINED_CONFIG_ARCHIVE_MAP, DPTConfig from .models.electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig, ElectraTokenizer from .models.encoder_decoder import EncoderDecoderConfig from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer @@ -2838,6 +2850,7 @@ if TYPE_CHECKING: from .models.convnext import ConvNextFeatureExtractor from .models.deit import DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor + from .models.dpt import DPTFeatureExtractor from .models.glpn import GLPNFeatureExtractor from .models.imagegpt import ImageGPTFeatureExtractor from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2Processor @@ -2877,7 +2890,6 @@ if TYPE_CHECKING: from .utils.dummy_scatter_objects import * if is_torch_available(): - # Benchmarks from .benchmark.benchmark import PyTorchBenchmark from .benchmark.benchmark_args import PyTorchBenchmarkArguments @@ -3188,6 +3200,13 @@ if TYPE_CHECKING: DPRQuestionEncoder, DPRReader, ) + from .models.dpt import ( + DPT_PRETRAINED_MODEL_ARCHIVE_LIST, + DPTForDepthEstimation, + DPTForSemanticSegmentation, + DPTModel, + DPTPreTrainedModel, + ) from .models.electra import ( ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST, ElectraForCausalLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 5eedfdc95d..7045f18c55 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -50,6 +50,7 @@ from . import ( distilbert, dit, dpr, + dpt, electra, encoder_decoder, flaubert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6b61edd8ae..1899c3c249 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -29,6 +29,7 @@ logger = logging.get_logger(__name__) CONFIG_MAPPING_NAMES = OrderedDict( [ # Add configs here + ("dpt", "DPTConfig"), ("decision_transformer", "DecisionTransformerConfig"), ("glpn", "GLPNConfig"), ("maskformer", "MaskFormerConfig"), @@ -134,6 +135,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( [ # Add archive maps here + ("dpt", "DPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("glpn", "GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("maskformer", "MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("poolformer", "POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -224,6 +226,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here + ("dpt", "DPT"), ("decision_transformer", "Decision Transformer"), ("glpn", "GLPN"), ("maskformer", "MaskFormer"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 735850da6c..b0cfb47672 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -28,6 +28,7 @@ logger = logging.get_logger(__name__) MODEL_MAPPING_NAMES = OrderedDict( [ # Base model mapping + ("dpt", "DPTModel"), ("decision_transformer", "DecisionTransformerModel"), ("glpn", "GLPNModel"), ("maskformer", "MaskFormerModel"), @@ -319,6 +320,7 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict( # Model for Semantic Segmentation mapping ("beit", "BeitForSemanticSegmentation"), ("segformer", "SegformerForSemanticSegmentation"), + ("dpt", "DPTForSemanticSegmentation"), ] ) diff --git a/src/transformers/models/dpt/__init__.py b/src/transformers/models/dpt/__init__.py new file mode 100644 index 0000000000..ba895de9b9 --- /dev/null +++ b/src/transformers/models/dpt/__init__.py @@ -0,0 +1,59 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available, is_vision_available + + +_import_structure = { + "configuration_dpt": ["DPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DPTConfig"], +} + +if is_vision_available(): + _import_structure["feature_extraction_dpt"] = ["DPTFeatureExtractor"] + +if is_torch_available(): + _import_structure["modeling_dpt"] = [ + "DPT_PRETRAINED_MODEL_ARCHIVE_LIST", + "DPTForDepthEstimation", + "DPTForSemanticSegmentation", + "DPTModel", + "DPTPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_dpt import DPT_PRETRAINED_CONFIG_ARCHIVE_MAP, DPTConfig + + if is_vision_available(): + from .feature_extraction_dpt import DPTFeatureExtractor + + if is_torch_available(): + from .modeling_dpt import ( + DPT_PRETRAINED_MODEL_ARCHIVE_LIST, + DPTForDepthEstimation, + DPTForSemanticSegmentation, + DPTModel, + DPTPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py new file mode 100644 index 0000000000..a255b0596b --- /dev/null +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -0,0 +1,170 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" DPT model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +DPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "Intel/dpt-large": "https://huggingface.co/Intel/dpt-large/resolve/main/config.json", + # See all DPT models at https://huggingface.co/models?filter=dpt +} + + +class DPTConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DPTModel`]. It is used to instantiate an DPT + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the DPT + [Intel/dpt-large](https://huggingface.co/Intel/dpt-large) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 384): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + backbone_out_indices (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`): + Indices of the intermediate hidden states to use from backbone. + readout_type (`str`, *optional*, defaults to `"project"`): + The readout type to use when processing the readout token (CLS token) of the intermediate hidden states of + the ViT backbone. Can be one of [`"ignore"`, `"add"`, `"project"`]. + + - "ignore" simply ignores the CLS token. + - "add" passes the information from the CLS token to all other tokens by adding the representations. + - "project" passes information to the other tokens by concatenating the readout to all other tokens before + projecting the + representation to the original feature dimension D using a linear layer followed by a GELU non-linearity. + reassemble_factors (`List[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`): + The up/downsampling factors of the reassemble layers. + neck_hidden_sizes (`List[str]`, *optional*, defaults to [96, 192, 384, 768]): + The hidden sizes to project to for the feature maps of the backbone. + fusion_hidden_size (`int`, *optional*, defaults to 256): + The number of channels before fusion. + head_in_index (`int`, *optional*, defaults to -1): + The index of the features to use in the heads. + use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): + Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_auxiliary_head (`bool`, *optional*, defaults to `True`): + Whether to use an auxiliary head during training. + auxiliary_loss_weight (`float`, *optional*, defaults to 0.4): + Weight of the cross-entropy loss of the auxiliary head. + semantic_loss_ignore_index (`int`, *optional*, defaults to 255): + The index that is ignored by the loss function of the semantic segmentation model. + semantic_classifier_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the semantic classification head. + + Example: + + ```python + >>> from transformers import DPTModel, DPTConfig + + >>> # Initializing a DPT dpt-large style configuration + >>> configuration = DPTConfig() + + >>> # Initializing a model from the dpt-large style configuration + >>> model = DPTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "dpt" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=384, + patch_size=16, + num_channels=3, + qkv_bias=True, + backbone_out_indices=[2, 5, 8, 11], + readout_type="project", + reassemble_factors=[4, 2, 1, 0.5], + neck_hidden_sizes=[96, 192, 384, 768], + fusion_hidden_size=256, + head_in_index=-1, + use_batch_norm_in_fusion_residual=False, + use_auxiliary_head=True, + auxiliary_loss_weight=0.4, + semantic_loss_ignore_index=255, + semantic_classifier_dropout=0.1, + **kwargs + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.backbone_out_indices = backbone_out_indices + if readout_type not in ["ignore", "add", "project"]: + raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']") + self.readout_type = readout_type + self.reassemble_factors = reassemble_factors + self.neck_hidden_sizes = neck_hidden_sizes + self.fusion_hidden_size = fusion_hidden_size + self.head_in_index = head_in_index + self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual + # auxiliary head attributes (semantic segmentation) + self.use_auxiliary_head = use_auxiliary_head + self.auxiliary_loss_weight = auxiliary_loss_weight + self.semantic_loss_ignore_index = semantic_loss_ignore_index + self.semantic_classifier_dropout = semantic_classifier_dropout diff --git a/src/transformers/models/dpt/convert_dpt_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_to_pytorch.py new file mode 100644 index 0000000000..0050f5e0a8 --- /dev/null +++ b/src/transformers/models/dpt/convert_dpt_to_pytorch.py @@ -0,0 +1,283 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DPT checkpoints from the original repository. URL: https://github.com/isl-org/DPT""" + + +import argparse +import json +from pathlib import Path + +import torch +from PIL import Image + +import requests +from huggingface_hub import cached_download, hf_hub_url +from transformers import DPTConfig, DPTFeatureExtractor, DPTForDepthEstimation, DPTForSemanticSegmentation +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_dpt_config(checkpoint_url): + config = DPTConfig() + + if "large" in checkpoint_url: + config.hidden_size = 1024 + config.intermediate_size = 4096 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + config.backbone_out_indices = [5, 11, 17, 23] + config.neck_hidden_sizes = [256, 512, 1024, 1024] + expected_shape = (1, 384, 384) + + if "ade" in checkpoint_url: + config.use_batch_norm_in_fusion_residual = True + + config.num_labels = 150 + repo_id = "datasets/huggingface/label-files" + filename = "ade20k-id2label.json" + id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + expected_shape = [1, 150, 480, 480] + + return config, expected_shape + + +def remove_ignore_keys_(state_dict): + ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(name): + if ( + "pretrained.model" in name + and "cls_token" not in name + and "pos_embed" not in name + and "patch_embed" not in name + ): + name = name.replace("pretrained.model", "dpt.encoder") + if "pretrained.model" in name: + name = name.replace("pretrained.model", "dpt.embeddings") + if "patch_embed" in name: + name = name.replace("patch_embed", "patch_embeddings") + if "pos_embed" in name: + name = name.replace("pos_embed", "position_embeddings") + if "attn.proj" in name: + name = name.replace("attn.proj", "attention.output.dense") + if "proj" in name and "project" not in name: + name = name.replace("proj", "projection") + if "blocks" in name: + name = name.replace("blocks", "layer") + if "mlp.fc1" in name: + name = name.replace("mlp.fc1", "intermediate.dense") + if "mlp.fc2" in name: + name = name.replace("mlp.fc2", "output.dense") + if "norm1" in name: + name = name.replace("norm1", "layernorm_before") + if "norm2" in name: + name = name.replace("norm2", "layernorm_after") + if "scratch.output_conv" in name: + name = name.replace("scratch.output_conv", "head") + if "scratch" in name: + name = name.replace("scratch", "neck") + if "layer1_rn" in name: + name = name.replace("layer1_rn", "convs.0") + if "layer2_rn" in name: + name = name.replace("layer2_rn", "convs.1") + if "layer3_rn" in name: + name = name.replace("layer3_rn", "convs.2") + if "layer4_rn" in name: + name = name.replace("layer4_rn", "convs.3") + if "refinenet" in name: + layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1]) + # tricky here: we need to map 4 to 0, 3 to 1, 2 to 2 and 1 to 3 + name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx-4)}") + if "out_conv" in name: + name = name.replace("out_conv", "projection") + if "resConfUnit1" in name: + name = name.replace("resConfUnit1", "residual_layer1") + if "resConfUnit2" in name: + name = name.replace("resConfUnit2", "residual_layer2") + if "conv1" in name: + name = name.replace("conv1", "convolution1") + if "conv2" in name: + name = name.replace("conv2", "convolution2") + # readout blocks + if "pretrained.act_postprocess1.0.project.0" in name: + name = name.replace("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0") + if "pretrained.act_postprocess2.0.project.0" in name: + name = name.replace("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0") + if "pretrained.act_postprocess3.0.project.0" in name: + name = name.replace("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0") + if "pretrained.act_postprocess4.0.project.0" in name: + name = name.replace("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0") + # resize blocks + if "pretrained.act_postprocess1.3" in name: + name = name.replace("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection") + if "pretrained.act_postprocess1.4" in name: + name = name.replace("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize") + if "pretrained.act_postprocess2.3" in name: + name = name.replace("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection") + if "pretrained.act_postprocess2.4" in name: + name = name.replace("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize") + if "pretrained.act_postprocess3.3" in name: + name = name.replace("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection") + if "pretrained.act_postprocess4.3" in name: + name = name.replace("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection") + if "pretrained.act_postprocess4.4" in name: + name = name.replace("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize") + if "pretrained" in name: + name = name.replace("pretrained", "dpt") + if "bn" in name: + name = name.replace("bn", "batch_norm") + if "head" in name: + name = name.replace("head", "head.head") + if "encoder.norm" in name: + name = name.replace("encoder.norm", "layernorm") + if "auxlayer" in name: + name = name.replace("auxlayer", "auxiliary_head.head") + + return name + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + for i in range(config.num_hidden_layers): + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] + state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] + state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + config.hidden_size : config.hidden_size * 2 + ] + state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ + -config.hidden_size :, : + ] + state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@torch.no_grad() +def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name): + """ + Copy/paste/tweak model's weights to our DPT structure. + """ + + # define DPT configuration based on URL + config, expected_shape = get_dpt_config(checkpoint_url) + # load original state_dict from URL + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") + # remove certain keys + remove_ignore_keys_(state_dict) + # rename keys + for key in state_dict.copy().keys(): + val = state_dict.pop(key) + state_dict[rename_key(key)] = val + # read in qkv matrices + read_in_q_k_v(state_dict, config) + + # load HuggingFace model + model = DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config) + model.load_state_dict(state_dict) + model.eval() + + # Check outputs on an image + size = 480 if "ade" in checkpoint_url else 384 + feature_extractor = DPTFeatureExtractor(size=size) + + image = prepare_img() + encoding = feature_extractor(image, return_tensors="pt") + + # forward pass + outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth + + # Assert logits + expected_slice = torch.tensor([[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]]) + if "ade" in checkpoint_url: + expected_slice = torch.tensor([[4.0480, 4.2420, 4.4360], [4.3124, 4.5693, 4.8261], [4.5768, 4.8965, 5.2163]]) + assert outputs.shape == torch.Size(expected_shape) + assert ( + torch.allclose(outputs[0, 0, :3, :3], expected_slice, atol=1e-4) + if "ade" in checkpoint_url + else torch.allclose(outputs[0, :3, :3], expected_slice) + ) + + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + print("Pushing model to hub...") + model.push_to_hub( + repo_path_or_name=Path(pytorch_dump_folder_path, model_name), + organization="nielsr", + commit_message="Add model", + use_temp_dir=True, + ) + feature_extractor.push_to_hub( + repo_path_or_name=Path(pytorch_dump_folder_path, model_name), + organization="nielsr", + commit_message="Add feature extractor", + use_temp_dir=True, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--checkpoint_url", + default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt", + type=str, + help="URL of the original DPT checkpoint you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + required=True, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + ) + parser.add_argument( + "--model_name", + default="dpt-large", + type=str, + help="Name of the model, in case you're pushing to the hub.", + ) + + args = parser.parse_args() + convert_dpt_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name) diff --git a/src/transformers/models/dpt/feature_extraction_dpt.py b/src/transformers/models/dpt/feature_extraction_dpt.py new file mode 100644 index 0000000000..d4346b96f8 --- /dev/null +++ b/src/transformers/models/dpt/feature_extraction_dpt.py @@ -0,0 +1,202 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for DPT.""" + +from typing import Optional, Union + +import numpy as np +from PIL import Image + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...file_utils import TensorType +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ImageFeatureExtractionMixin, + ImageInput, + is_torch_tensor, +) +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DPTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a DPT feature extractor. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input to a certain `size`. + size ('int' or `Tuple(int)`, *optional*, defaults to 384): + Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an + integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize` is + set to `True`. + ensure_multiple_of (`int`, *optional*, defaults to 1): + Ensure that the input is resized to a multiple of this value. Only has an effect if `do_resize` is set to + `True`. + keep_aspect_ratio (`bool`, *optional*, defaults to `False`): + Whether to keep the aspect ratio of the input. Only has an effect if `do_resize` is set to `True`. + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, + `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect + if `do_resize` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + image_mean (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): + The sequence of means for each channel, to be used when normalizing images. + image_std (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): + The sequence of standard deviations for each channel, to be used when normalizing images. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize=True, + size=384, + keep_aspect_ratio=False, + ensure_multiple_of=1, + resample=Image.BILINEAR, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.do_resize = do_resize + self.size = size + self.keep_aspect_ratio = keep_aspect_ratio + self.ensure_multiple_of = ensure_multiple_of + self.resample = resample + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def constrain_to_multiple_of(self, size, min_val=0, max_val=None): + y = (np.round(size / self.ensure_multiple_of) * self.ensure_multiple_of).astype(int) + + if max_val is not None and y > max_val: + y = (np.floor(size / self.ensure_multiple_of) * self.ensure_multiple_of).astype(int) + + if y < min_val: + y = (np.ceil(size / self.ensure_multiple_of) * self.ensure_multiple_of).astype(int) + + return y + + def update_size(self, image): + image = self.to_pil_image(image) + width, height = image.size + + size = self.size + + if isinstance(size, list): + size = tuple(size) + + if isinstance(size, int) or len(size) == 1: + size = (size, size) + + # determine new width and height + scale_width = size[0] / width + scale_height = size[1] / height + + if self.keep_aspect_ratio: + # scale as least as possbile + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + else: + new_width = self.constrain_to_multiple_of(scale_width * width) + new_height = self.constrain_to_multiple_of(scale_height * height) + + return (new_width, new_height) + + def __call__( + self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s). + + + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + + + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height, + width). + """ + # Input type checking for clearer error + valid_images = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." + ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + if not is_batched: + images = [images] + + # transformations (resizing + normalization) + if self.do_resize and self.size is not None: + for idx, image in enumerate(images): + size = self.update_size(image) + images[idx] = self.resize(image, size=size, resample=self.resample) + if self.do_normalize: + images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images] + + # return as BatchFeature + data = {"pixel_values": images} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py new file mode 100755 index 0000000000..ba454dc964 --- /dev/null +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -0,0 +1,1132 @@ +# coding=utf-8 +# Copyright 2022 Intel Labs, OpenMMLab and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch DPT (Dense Prediction Transformers) model. + +This implementation is heavily inspired by OpenMMLab's implementation, found here: +https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py. + +""" + + +import collections.abc +import math +from typing import List, Optional, Set, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + DepthEstimatorOutput, + SemanticSegmenterOutput, +) +from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import logging +from .configuration_dpt import DPTConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "DPTConfig" +_FEAT_EXTRACTOR_FOR_DOC = "DPTFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "Intel/dpt-large" +_EXPECTED_OUTPUT_SHAPE = [1, 577, 1024] + + +DPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Intel/dpt-large", + # See all DPT models at https://huggingface.co/models?filter=dpt +] + + +# Copied from transformers.models.vit.modeling_vit.to_2tuple +def to_2tuple(x): + if isinstance(x, collections.abc.Iterable): + return x + return (x, x) + + +class DPTViTEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. + + """ + + def __init__(self, config): + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.patch_embeddings = DPTViTPatchEmbeddings( + image_size=config.image_size, + patch_size=config.patch_size, + num_channels=config.num_channels, + embed_dim=config.hidden_size, + ) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1): + posemb_tok = posemb[:, :start_index] + posemb_grid = posemb[0, start_index:] + + old_grid_size = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2) + posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear") + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + def forward(self, pixel_values): + batch_size, num_channels, height, width = pixel_values.shape + + # possibly interpolate position encodings to handle varying image sizes + patch_size = self.config.patch_size + position_embeddings = self._resize_pos_embed( + self.position_embeddings, height // patch_size, width // patch_size + ) + + embeddings = self.patch_embeddings(pixel_values) + + batch_size, seq_len, _ = embeddings.size() + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +class DPTViTPatchEmbeddings(nn.Module): + """ + Image to Patch Embedding. + + """ + + def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768): + super().__init__() + image_size = to_2tuple(image_size) + patch_size = to_2tuple(patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values): + batch_size, num_channels, height, width = pixel_values.shape + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DPT +class DPTViTSelfAttention(nn.Module): + def __init__(self, config: DPTConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->DPT +class DPTViTSelfOutput(nn.Module): + """ + The residual connection is defined in DPTLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: DPTConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class DPTViTAttention(nn.Module): + def __init__(self, config: DPTConfig) -> None: + super().__init__() + self.attention = DPTViTSelfAttention(config) + self.output = DPTViTSelfOutput(config) + self.pruned_heads = set() + + # Copied from transformers.models.vit.modeling_vit.ViTAttention.prune_heads + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + # Copied from transformers.models.vit.modeling_vit.ViTAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->DPT +class DPTViTIntermediate(nn.Module): + def __init__(self, config: DPTConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->DPT +class DPTViTOutput(nn.Module): + def __init__(self, config: DPTConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +# copied from transformers.models.vit.modeling_vit.ViTLayer with ViTConfig->DPTConfig, ViTAttention->DPTViTAttention, ViTIntermediate->DPTViTIntermediate, ViTOutput->DPTViTOutput +class DPTViTLayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: DPTConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = DPTViTAttention(config) + self.intermediate = DPTViTIntermediate(config) + self.output = DPTViTOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViT, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in ViT, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +# copied from transformers.models.vit.modeling_vit.ViTEncoder with ViTConfig -> DPTConfig, ViTLayer->DPTViTLayer +class DPTViTEncoder(nn.Module): + def __init__(self, config: DPTConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([DPTViTLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + layer_head_mask, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class DPTReassembleStage(nn.Module): + """ + This class reassembles the hidden states of the backbone into image-like feature representations at various + resolutions. + + This happens in 3 stages: + 1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to + `config.readout_type`. + 2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`. + 3. Resizing the spatial dimensions (height, width). + + Args: + config (`[DPTConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.config = config + self.layers = nn.ModuleList() + for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors): + self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor)) + + if config.readout_type == "project": + self.readout_projects = nn.ModuleList() + for _ in range(len(config.neck_hidden_sizes)): + self.readout_projects.append( + nn.Sequential(nn.Linear(2 * config.hidden_size, config.hidden_size), ACT2FN[config.hidden_act]) + ) + + def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: + """ + Args: + hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`): + List of hidden states from the backbone. + """ + out = [] + + for i, hidden_state in enumerate(hidden_states): + # reshape to (B, C, H, W) + hidden_state, cls_token = hidden_state[:, 1:], hidden_state[:, 0] + batch_size, sequence_length, num_channels = hidden_state.shape + size = int(math.sqrt(sequence_length)) + hidden_state = hidden_state.reshape(batch_size, size, size, num_channels) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + + feature_shape = hidden_state.shape + if self.config.readout_type == "project": + # reshape to (B, H*W, C) + hidden_state = hidden_state.flatten(2).permute((0, 2, 1)) + readout = cls_token.unsqueeze(1).expand_as(hidden_state) + # concatenate the readout token to the hidden states and project + hidden_state = self.readout_projects[i](torch.cat((hidden_state, readout), -1)) + # reshape back to (B, C, H, W) + hidden_state = hidden_state.permute(0, 2, 1).reshape(feature_shape) + elif self.config.readout_type == "add": + hidden_state = hidden_state.flatten(2) + cls_token.unsqueeze(-1) + hidden_state = hidden_state.reshape(feature_shape) + hidden_state = self.layers[i](hidden_state) + out.append(hidden_state) + + return out + + +class DPTReassembleLayer(nn.Module): + def __init__(self, config, channels, factor): + super().__init__() + # projection + self.projection = nn.Conv2d(in_channels=config.hidden_size, out_channels=channels, kernel_size=1) + + # up/down sampling depending on factor + if factor > 1: + self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0) + elif factor == 1: + self.resize = nn.Identity() + elif factor < 1: + # so should downsample + self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1) + + def forward(self, hidden_state): + hidden_state = self.projection(hidden_state) + hidden_state = self.resize(hidden_state) + return hidden_state + + +class DPTFeatureFusionStage(nn.Module): + def __init__(self, config): + super().__init__() + self.layers = nn.ModuleList() + for _ in range(len(config.neck_hidden_sizes)): + self.layers.append(DPTFeatureFusionLayer(config)) + + def forward(self, hidden_states): + # reversing the hidden_states, we start from the last + hidden_states = hidden_states[::-1] + + fused_hidden_states = [] + # first layer only uses the last hidden_state + fused_hidden_state = self.layers[0](hidden_states[0]) + fused_hidden_states.append(fused_hidden_state) + # looping from the last layer to the second + for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): + fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) + + return fused_hidden_states + + +class DPTPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. + + Args: + config (`[DPTConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.use_batch_norm = config.use_batch_norm_in_fusion_residual + self.activation1 = ACT2FN["relu"] + self.convolution1 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=not self.use_batch_norm, + ) + + self.activation2 = ACT2FN["relu"] + self.convolution2 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=not self.use_batch_norm, + ) + + if self.use_batch_norm: + self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size) + self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + + hidden_state = self.convolution1(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm1(hidden_state) + + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm2(hidden_state) + + return hidden_state + residual + + +class DPTFeatureFusionLayer(nn.Module): + """Feature fusion layer, merges feature maps from different stages. + + Args: + config (`[DPTConfig]`): + Model configuration class defining the model architecture. + align_corners (`bool`, *optional*, defaults to `True`): + The align_corner setting for bilinear upsample. + """ + + def __init__(self, config, align_corners=True): + super().__init__() + + self.align_corners = align_corners + + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) + + self.residual_layer1 = DPTPreActResidualLayer(config) + self.residual_layer2 = DPTPreActResidualLayer(config) + + def forward(self, hidden_state, residual=None): + if residual is not None: + if hidden_state.shape != residual.shape: + residual = nn.functional.interpolate( + residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False + ) + hidden_state = hidden_state + self.residual_layer1(residual) + + hidden_state = self.residual_layer2(hidden_state) + hidden_state = nn.functional.interpolate( + hidden_state, scale_factor=2, mode="bilinear", align_corners=self.align_corners + ) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +class DPTPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DPTConfig + base_model_prefix = "dpt" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, DPTViTEncoder): + module.gradient_checkpointing = value + + +DPT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`ViTConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DPT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See + [`ViTFeatureExtractor.__call__`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare DPT Model transformer outputting raw hidden-states without any specific head on top.", + DPT_START_DOCSTRING, +) +class DPTModel(DPTPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + # vit encoder + self.embeddings = DPTViTEmbeddings(config) + self.encoder = DPTViTEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = DPTViTPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViT->DPT +class DPTViTPooler(nn.Module): + def __init__(self, config: DPTConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class DPTNeck(nn.Module): + """ + DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as + input and produces another list of tensors as output. For DPT, it includes 2 stages: + + * DPTReassembleStage + * DPTFeatureFusionStage. + + Args: + config (dict): config dict. + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + # postprocessing + self.reassemble_stage = DPTReassembleStage(config) + self.convs = nn.ModuleList() + for channel in config.neck_hidden_sizes: + self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False)) + + # fusion + self.fusion_stage = DPTFeatureFusionStage(config) + + def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: + if not isinstance(hidden_states, list): + raise ValueError("hidden_states should be a list of tensors") + + if len(hidden_states) != len(self.config.neck_hidden_sizes): + raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") + + # postprocess hidden states + features = self.reassemble_stage(hidden_states) + + features = [self.convs[i](feature) for i, feature in enumerate(features)] + + # fusion blocks + output = self.fusion_stage(features) + + return output + + +class DPTDepthEstimationHead(nn.Module): + """ + Output head head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples + the predictions to the input resolution after the first convolutional layer (details can be found in the paper's + supplementary material). + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + features = config.fusion_hidden_size + self.head = nn.Sequential( + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), + nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), + ACT2FN["relu"], + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + ACT2FN["relu"], + ) + + def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: + # use last features + hidden_states = hidden_states[self.config.head_in_index] + + predicted_depth = self.head(hidden_states) + + predicted_depth = predicted_depth.squeeze(dim=1) + + return predicted_depth + + +@add_start_docstrings( + """ + DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. + """, + DPT_START_DOCSTRING, +) +class DPTForDepthEstimation(DPTPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.dpt = DPTModel(config, add_pooling_layer=False) + + # Neck + self.neck = DPTNeck(config) + + # Depth estimation head + self.head = DPTDepthEstimationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values, + head_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Returns: + + Examples: + ```python + >>> from transformers import DPTFeatureExtractor, DPTForDepthEstimation + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large") + >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") + + >>> # prepare image for the model + >>> inputs = feature_extractor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + ... predicted_depth = outputs.predicted_depth + + >>> # interpolate to original size + >>> prediction = torch.nn.functional.interpolate( + ... predicted_depth.unsqueeze(1), + ... size=image.size[::-1], + ... mode="bicubic", + ... align_corners=False, + ... ) + + >>> # visualize the prediction + >>> output = prediction.squeeze().cpu().numpy() + >>> formatted = (output * 255 / np.max(output)).astype("uint8") + >>> depth = Image.fromarray(formatted) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.dpt( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[2] + + # only keep certain features based on config.backbone_out_indices + # note that the hidden_states also include the initial embeddings + hidden_states = [ + feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices + ] + + hidden_states = self.neck(hidden_states) + + predicted_depth = self.head(hidden_states) + + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + if not return_dict: + if output_hidden_states: + output = (predicted_depth,) + outputs[2:] + else: + output = (predicted_depth,) + outputs[3:] + return ((loss,) + output) if loss is not None else output + + return DepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +class DPTSemanticSegmentationHead(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + + features = config.fusion_hidden_size + self.head = nn.Sequential( + nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(features), + ACT2FN["relu"], + nn.Dropout(config.semantic_classifier_dropout), + nn.Conv2d(features, config.num_labels, kernel_size=1), + nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True), + ) + + def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: + # use last features + hidden_states = hidden_states[self.config.head_in_index] + + logits = self.head(hidden_states) + + return logits + + +class DPTAuxiliaryHead(nn.Module): + def __init__(self, config): + super().__init__() + + features = config.fusion_hidden_size + self.head = nn.Sequential( + nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(features), + ACT2FN["relu"], + nn.Dropout(0.1, False), + nn.Conv2d(features, config.num_labels, kernel_size=1), + ) + + def forward(self, hidden_states): + logits = self.head(hidden_states) + + return logits + + +@add_start_docstrings( + """ + DPT Model with a semantic segmentation head on top e.g. for ADE20k, CityScapes. + """, + DPT_START_DOCSTRING, +) +class DPTForSemanticSegmentation(DPTPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.dpt = DPTModel(config, add_pooling_layer=False) + + # Neck + self.neck = DPTNeck(config) + + # Segmentation head(s) + self.head = DPTSemanticSegmentationHead(config) + self.auxiliary_head = DPTAuxiliaryHead(config) if config.use_auxiliary_head else None + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values=None, + head_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). + + Returns: + + Examples: + ```python + >>> from transformers import DPTFeatureExtractor, DPTForSemanticSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large-ade") + >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade") + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.dpt( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[2] + + # only keep certain features based on config.backbone_out_indices + # note that the hidden_states also include the initial embeddings + hidden_states = [ + feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices + ] + + hidden_states = self.neck(hidden_states) + + logits = self.head(hidden_states) + + auxiliary_logits = None + if self.auxiliary_head is not None: + auxiliary_logits = self.auxiliary_head(hidden_states[-1]) + + loss = None + if labels is not None: + if self.config.num_labels == 1: + raise ValueError("The number of labels should be greater than one") + else: + # upsample logits to the images' original size + upsampled_logits = nn.functional.interpolate( + logits, size=labels.shape[-2:], mode="bilinear", align_corners=False + ) + if auxiliary_logits is not None: + upsampled_auxiliary_logits = nn.functional.interpolate( + auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False + ) + # compute weighted loss + loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index) + main_loss = loss_fct(upsampled_logits, labels) + auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels) + loss = main_loss + self.config.auxiliary_loss_weight * auxiliary_loss + + if not return_dict: + if output_hidden_states: + output = (logits,) + outputs[2:] + else: + output = (logits,) + outputs[3:] + return ((loss,) + output) if loss is not None else output + + return SemanticSegmenterOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 0ce70b2986..2bd71277b5 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1604,6 +1604,37 @@ class DPRReader(metaclass=DummyObject): requires_backends(self, ["torch"]) +DPT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class DPTForDepthEstimation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DPTForSemanticSegmentation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DPTModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DPTPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index f0e19d35f3..6ffeeb52b3 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -52,6 +52,13 @@ class DetrFeatureExtractor(metaclass=DummyObject): requires_backends(self, ["vision"]) +class DPTFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class GLPNFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/dpt/__init__.py b/tests/dpt/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/dpt/test_feature_extraction_dpt.py b/tests/dpt/test_feature_extraction_dpt.py new file mode 100644 index 0000000000..83abb59eec --- /dev/null +++ b/tests/dpt/test_feature_extraction_dpt.py @@ -0,0 +1,188 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np + +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision + +from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import DPTFeatureExtractor + + +class DPTFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=18, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + } + + +@require_torch +@require_vision +class DPTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = DPTFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = DPTFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) diff --git a/tests/dpt/test_modeling_dpt.py b/tests/dpt/test_modeling_dpt.py new file mode 100644 index 0000000000..93601a5723 --- /dev/null +++ b/tests/dpt/test_modeling_dpt.py @@ -0,0 +1,393 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch DPT model. """ + + +import inspect +import unittest + +from transformers import DPTConfig +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ..test_configuration_common import ConfigTester +from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import MODEL_MAPPING, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTModel + from transformers.models.dpt.modeling_dpt import DPT_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import DPTFeatureExtractor + + +class DPTModelTester: + def __init__( + self, + parent, + batch_size=2, + image_size=32, + patch_size=16, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=4, + backbone_out_indices=[0, 1, 2, 3], + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.backbone_out_indices = backbone_out_indices + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DPTConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + backbone_out_indices=self.backbone_out_indices, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DPTModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + num_patches = (config.image_size // config.patch_size) ** 2 + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DPTForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + + def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DPTForSemanticSegmentation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, self.image_size, self.image_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DPTModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as DPT does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (DPTModel, DPTForDepthEstimation, DPTForSemanticSegmentation) if is_torch_available() else () + + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = DPTModelTester(self) + self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_inputs_embeds(self): + # DPT does not use inputs_embeds + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + def test_for_semantic_segmentation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # in DPT, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (config.image_size // config.patch_size) ** 2 + seq_len = num_patches + 1 + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(len(outputs.attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 1, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # DPT has a different seq_length + num_patches = (config.image_size // config.patch_size) ** 2 + seq_len = num_patches + 1 + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_len, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_training(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DPTForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class in get_values(MODEL_MAPPING): + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DPTForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + if model_class in get_values(MODEL_MAPPING) or not model_class.supports_gradient_checkpointing: + continue + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + @slow + def test_model_from_pretrained(self): + for model_name in DPT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DPTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class DPTModelIntegrationTest(unittest.TestCase): + def test_inference_depth_estimation(self): + feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large") + model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large").to(torch_device) + + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 384, 384)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4)) + + def test_inference_semantic_segmentation(self): + feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large-ade") + model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade").to(torch_device) + + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 150, 480, 480)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor( + [[4.0480, 4.2420, 4.4360], [4.3124, 4.5693, 4.8261], [4.5768, 4.8965, 5.2163]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, 0, :3, :3], expected_slice, atol=1e-4)) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 8e24175552..d256bcd10e 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -79,6 +79,7 @@ if is_torch_available(): MODEL_FOR_MULTIPLE_CHOICE_MAPPING, MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, @@ -177,6 +178,11 @@ class ModelTesterMixin: inputs_dict["bool_masked_pos"] = torch.zeros( (self.model_tester.batch_size, num_patches**2), dtype=torch.long, device=torch_device ) + elif model_class in get_values(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING): + batch_size, num_channels, height, width = inputs_dict["pixel_values"].shape + inputs_dict["labels"] = torch.zeros( + [self.model_tester.batch_size, height, width], device=torch_device + ).long() return inputs_dict diff --git a/utils/check_repo.py b/utils/check_repo.py index da50850b5e..5f81c3bcfc 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -117,6 +117,7 @@ TEST_FILES_WITH_NO_COMMON_TESTS = [ # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping + "DPTForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", "ViltForQuestionAnswering", diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index b5d9f8570c..5e2435ef46 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -16,6 +16,7 @@ src/transformers/models/blenderbot_small/modeling_blenderbot_small.py src/transformers/models/convnext/modeling_convnext.py src/transformers/models/data2vec/modeling_data2vec_audio.py src/transformers/models/deit/modeling_deit.py +src/transformers/models/dpt/modeling_dpt.py src/transformers/models/glpn/modeling_glpn.py src/transformers/models/hubert/modeling_hubert.py src/transformers/models/marian/modeling_marian.py @@ -48,4 +49,4 @@ src/transformers/models/vit_mae/modeling_vit_mae.py src/transformers/models/wav2vec2/modeling_wav2vec2.py src/transformers/models/wav2vec2/tokenization_wav2vec2.py src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py -src/transformers/models/wavlm/modeling_wavlm.py +src/transformers/models/wavlm/modeling_wavlm.py \ No newline at end of file