From 07c54413ac24f891fc37920f6c61ad8b7b035dc3 Mon Sep 17 00:00:00 2001 From: Shehan Munasinghe Date: Fri, 2 Jun 2023 15:07:02 +0530 Subject: [PATCH] Add MobileViTv2 (#22820) * generated code from add-new-model-like * Add code for modeling, config, and weight conversion * add tests for image-classification, update modeling and config * add code, tests for semantic-segmentation * make style, make quality, make fix-copies * make fix-copies * Update modeling_mobilevitv2.py fix bugs * Update _toctree.yml * update modeling, config fix bugs * Edit docs - fix bug MobileViTv2v2 -> MobileViTv2 * Update mobilevitv2.mdx * update docstrings * Update configuration_mobilevitv2.py make style * Update convert_mlcvnets_to_pytorch.py remove unused options * Update convert_mlcvnets_to_pytorch.py make style * Add suggestions from code review Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * make style, make quality * Add suggestions from code review Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Add suggestions from code review Remove MobileViTv2ImageProcessor Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * make style * Add suggestions from code review Rename MobileViTv2 -> MobileViTV2 Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Add suggestions from code review Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update modeling_mobilevitv2.py make style * Update serialization.mdx * Update modeling_mobilevitv2.py --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 3 +- README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/mobilevitv2.mdx | 53 + docs/source/en/tasks/image_classification.mdx | 2 +- .../source/en/tasks/semantic_segmentation.mdx | 2 +- src/transformers/__init__.py | 18 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + .../models/mobilevitv2/__init__.py | 71 ++ .../mobilevitv2/configuration_mobilevitv2.py | 168 +++ .../convert_mlcvnets_to_pytorch.py | 326 +++++ .../mobilevitv2/modeling_mobilevitv2.py | 1044 +++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 31 + tests/models/mobilevitv2/__init__.py | 0 .../mobilevitv2/test_modeling_mobilevitv2.py | 383 ++++++ 24 files changed, 2116 insertions(+), 3 deletions(-) create mode 100644 docs/source/en/model_doc/mobilevitv2.mdx create mode 100644 src/transformers/models/mobilevitv2/__init__.py create mode 100644 src/transformers/models/mobilevitv2/configuration_mobilevitv2.py create mode 100644 src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py create mode 100644 src/transformers/models/mobilevitv2/modeling_mobilevitv2.py create mode 100644 tests/models/mobilevitv2/__init__.py create mode 100644 tests/models/mobilevitv2/test_modeling_mobilevitv2.py diff --git a/README.md b/README.md index c6371abd7b..7ae0e7ec2b 100644 --- a/README.md +++ b/README.md @@ -406,6 +406,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MobileViTV2](https://huggingface.co/docs/transformers/main/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. diff --git a/README_es.md b/README_es.md index bd503bb717..493c56c385 100644 --- a/README_es.md +++ b/README_es.md @@ -381,6 +381,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MobileViTV2](https://huggingface.co/docs/transformers/main/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. diff --git a/README_hd.md b/README_hd.md index 731e9975b5..ca28c89a0a 100644 --- a/README_hd.md +++ b/README_hd.md @@ -353,6 +353,7 @@ conda install -c huggingface transformers 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple से) साथ में कागज [MobileViT: लाइट-वेट, जनरल-पर्पस, और मोबाइल-फ्रेंडली विजन ट्रांसफॉर्मर] (https://arxiv.org/abs/2110.02178) सचिन मेहता और मोहम्मद रस्तगरी द्वारा पोस्ट किया गया। +1. **[MobileViTV2](https://huggingface.co/docs/transformers/main/model_doc/mobilevitv2)** (Apple से) Sachin Mehta and Mohammad Rastegari. द्वाराअनुसंधान पत्र [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) के साथ जारी किया गया 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI से) साथ वाला पेपर [mT5: एक व्यापक बहुभाषी पूर्व-प्रशिक्षित टेक्स्ट-टू-टेक्स्ट ट्रांसफॉर्मर]( https://arxiv.org/abs/2010.11934) लिंटिंग ज़ू, नोआ कॉन्सटेंट, एडम रॉबर्ट्स, मिहिर काले, रामी अल-रफू, आदित्य सिद्धांत, आदित्य बरुआ, कॉलिन रैफेल द्वारा पोस्ट किया गया। 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. diff --git a/README_ja.md b/README_ja.md index 9559f4d85e..001ef70b23 100644 --- a/README_ja.md +++ b/README_ja.md @@ -415,6 +415,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (Google Inc. から) Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam から公開された研究論文: [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (Google Inc. から) Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen から公開された研究論文: [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple から) Sachin Mehta and Mohammad Rastegari から公開された研究論文: [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) +1. **[MobileViTV2](https://huggingface.co/docs/transformers/main/model_doc/mobilevitv2)** (Apple から) Sachin Mehta and Mohammad Rastegari. から公開された研究論文 [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (Microsoft Research から) Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu から公開された研究論文: [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI から) Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel から公開された研究論文: [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (RUC AI Box から) Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen から公開された研究論文: [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) diff --git a/README_ko.md b/README_ko.md index f4203e6711..b6416ca2ae 100644 --- a/README_ko.md +++ b/README_ko.md @@ -330,6 +330,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (Google Inc. 에서) Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam 의 [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) 논문과 함께 발표했습니다. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (Google Inc. 에서) Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen 의 [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) 논문과 함께 발표했습니다. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple 에서) Sachin Mehta and Mohammad Rastegari 의 [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) 논문과 함께 발표했습니다. +1. **[MobileViTV2](https://huggingface.co/docs/transformers/main/model_doc/mobilevitv2)** (Apple 에서 제공)은 Sachin Mehta and Mohammad Rastegari.의 [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680)논문과 함께 발표했습니다. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (Microsoft Research 에서) Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 의 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 논문과 함께 발표했습니다. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI 에서) Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 의 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 논문과 함께 발표했습니다. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (RUC AI Box 에서) Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 의 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 2601f280e7..aed58c1f80 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -340,7 +340,7 @@ conda install -c huggingface transformers 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** 用 [OPUS](http://opus.nlpl.eu/) 数据训练的机器翻译模型由 Jörg Tiedemann 发布。[Marian Framework](https://marian-nmt.github.io/) 由微软翻译团队开发。 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (来自 Microsoft Research Asia) 伴随论文 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 由 Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 发布。 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (来自 FAIR and UIUC) 伴随论文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) 由 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar 发布。 -1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov +1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov 1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (来自 Google AI) 伴随论文 [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) 由 Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos 发布。 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 由 Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 发布。 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 由 Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 发布。 @@ -354,6 +354,7 @@ conda install -c huggingface transformers 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (来自 Google Inc.) 伴随论文 [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) 由 Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam 发布。 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (来自 Google Inc.) 伴随论文 [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) 由 Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen 发布。 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (来自 Apple) 伴随论文 [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) 由 Sachin Mehta and Mohammad Rastegari 发布。 +1. **[MobileViTV2](https://huggingface.co/docs/transformers/main/model_doc/mobilevitv2)** (来自 Apple) 伴随论文 [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) 由 Sachin Mehta and Mohammad Rastegari 发布。 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (来自 中国人民大学 AI Box) 伴随论文 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 由 Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index cd37ca9d14..6d2cae9292 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -366,6 +366,7 @@ conda install -c huggingface transformers 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MobileViTV2](https://huggingface.co/docs/transformers/main/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 8b43fb5177..60f98607bf 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -494,6 +494,8 @@ title: MobileNetV2 - local: model_doc/mobilevit title: MobileViT + - local: model_doc/mobilevitv2 + title: MobileViTV2 - local: model_doc/nat title: NAT - local: model_doc/poolformer diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index edf5bc00b3..ee47a3ab5e 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -167,6 +167,7 @@ The documentation is organized into five sections: 1. **[MobileNetV1](model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. 1. **[MobileNetV2](model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. 1. **[MobileViT](model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MobileViTV2](model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. @@ -369,6 +370,7 @@ Flax), PyTorch, and/or TensorFlow. | MobileNetV1 | ❌ | ❌ | ✅ | ❌ | ❌ | | MobileNetV2 | ❌ | ❌ | ✅ | ❌ | ❌ | | MobileViT | ❌ | ❌ | ✅ | ✅ | ❌ | +| MobileViTV2 | ❌ | ❌ | ✅ | ❌ | ❌ | | MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | | MT5 | ✅ | ✅ | ✅ | ✅ | ✅ | | MVP | ✅ | ✅ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/mobilevitv2.mdx b/docs/source/en/model_doc/mobilevitv2.mdx new file mode 100644 index 0000000000..1cfbb6808a --- /dev/null +++ b/docs/source/en/model_doc/mobilevitv2.mdx @@ -0,0 +1,53 @@ + + +# MobileViTV2 + +## Overview + +The MobileViTV2 model was proposed in [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. + +MobileViTV2 is the second version of MobileViT, constructed by replacing the multi-headed self-attention in MobileViT with separable self-attention. + +The abstract from the paper is the following: + +*Mobile vision transformers (MobileViT) can achieve state-of-the-art performance across several mobile vision tasks, including classification and detection. Though these models have fewer parameters, they have high latency as compared to convolutional neural network-based models. The main efficiency bottleneck in MobileViT is the multi-headed self-attention (MHA) in transformers, which requires O(k2) time complexity with respect to the number of tokens (or patches) k. Moreover, MHA requires costly operations (e.g., batch-wise matrix multiplication) for computing self-attention, impacting latency on resource-constrained devices. This paper introduces a separable self-attention method with linear complexity, i.e. O(k). A simple yet effective characteristic of the proposed method is that it uses element-wise operations for computing self-attention, making it a good choice for resource-constrained devices. The improved model, MobileViTV2, is state-of-the-art on several mobile vision tasks, including ImageNet object classification and MS-COCO object detection. With about three million parameters, MobileViTV2 achieves a top-1 accuracy of 75.6% on the ImageNet dataset, outperforming MobileViT by about 1% while running 3.2× faster on a mobile device.* + +Tips: + +- MobileViTV2 is more like a CNN than a Transformer model. It does not work on sequence data but on batches of images. Unlike ViT, there are no embeddings. The backbone model outputs a feature map. +- One can use [`MobileViTImageProcessor`] to prepare images for the model. Note that if you do your own preprocessing, the pretrained checkpoints expect images to be in BGR pixel order (not RGB). +- The available image classification checkpoints are pre-trained on [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k) (also referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes). +- The segmentation model uses a [DeepLabV3](https://arxiv.org/abs/1706.05587) head. The available semantic segmentation checkpoints are pre-trained on [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/). + +This model was contributed by [shehan97](https://huggingface.co/shehan97). +The original code can be found [here](https://github.com/apple/ml-cvnets). + + +## MobileViTV2Config + +[[autodoc]] MobileViTV2Config + +## MobileViTV2Model + +[[autodoc]] MobileViTV2Model + - forward + +## MobileViTV2ForImageClassification + +[[autodoc]] MobileViTV2ForImageClassification + - forward + +## MobileViTV2ForSemanticSegmentation + +[[autodoc]] MobileViTV2ForSemanticSegmentation + - forward diff --git a/docs/source/en/tasks/image_classification.mdx b/docs/source/en/tasks/image_classification.mdx index 635b0b6358..d8f6f63ce0 100644 --- a/docs/source/en/tasks/image_classification.mdx +++ b/docs/source/en/tasks/image_classification.mdx @@ -30,7 +30,7 @@ The task illustrated in this tutorial is supported by the following model archit -[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn) +[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn) diff --git a/docs/source/en/tasks/semantic_segmentation.mdx b/docs/source/en/tasks/semantic_segmentation.mdx index ffaa251688..eac2507c9a 100644 --- a/docs/source/en/tasks/semantic_segmentation.mdx +++ b/docs/source/en/tasks/semantic_segmentation.mdx @@ -28,7 +28,7 @@ The task illustrated in this tutorial is supported by the following model archit -[BEiT](../model_doc/beit), [Data2VecVision](../model_doc/data2vec-vision), [DPT](../model_doc/dpt), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [SegFormer](../model_doc/segformer), [UPerNet](../model_doc/upernet) +[BEiT](../model_doc/beit), [Data2VecVision](../model_doc/data2vec-vision), [DPT](../model_doc/dpt), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [SegFormer](../model_doc/segformer), [UPerNet](../model_doc/upernet) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 14ba5ec410..edb15c037b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -387,6 +387,7 @@ _import_structure = { "models.mobilenet_v1": ["MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileNetV1Config"], "models.mobilenet_v2": ["MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileNetV2Config"], "models.mobilevit": ["MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileViTConfig"], + "models.mobilevitv2": ["MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileViTV2Config"], "models.mpnet": ["MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "MPNetConfig", "MPNetTokenizer"], "models.mt5": ["MT5Config"], "models.mvp": ["MvpConfig", "MvpTokenizer"], @@ -2073,6 +2074,15 @@ else: "MobileViTPreTrainedModel", ] ) + _import_structure["models.mobilevitv2"].extend( + [ + "MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST", + "MobileViTV2ForImageClassification", + "MobileViTV2ForSemanticSegmentation", + "MobileViTV2Model", + "MobileViTV2PreTrainedModel", + ] + ) _import_structure["models.mpnet"].extend( [ "MPNET_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4183,6 +4193,7 @@ if TYPE_CHECKING: from .models.mobilenet_v1 import MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileNetV1Config from .models.mobilenet_v2 import MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileNetV2Config from .models.mobilevit import MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileViTConfig + from .models.mobilevitv2 import MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileViTV2Config from .models.mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig, MPNetTokenizer from .models.mt5 import MT5Config from .models.mvp import MvpConfig, MvpTokenizer @@ -5601,6 +5612,13 @@ if TYPE_CHECKING: MobileViTModel, MobileViTPreTrainedModel, ) + from .models.mobilevitv2 import ( + MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST, + MobileViTV2ForImageClassification, + MobileViTV2ForSemanticSegmentation, + MobileViTV2Model, + MobileViTV2PreTrainedModel, + ) from .models.mpnet import ( MPNET_PRETRAINED_MODEL_ARCHIVE_LIST, MPNetForMaskedLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 1aa2a049aa..26e4643728 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -130,6 +130,7 @@ from . import ( mobilenet_v1, mobilenet_v2, mobilevit, + mobilevitv2, mpnet, mt5, mvp, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 2c05989d99..72f8e26abf 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -133,6 +133,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("mobilenet_v1", "MobileNetV1Config"), ("mobilenet_v2", "MobileNetV2Config"), ("mobilevit", "MobileViTConfig"), + ("mobilevitv2", "MobileViTV2Config"), ("mpnet", "MPNetConfig"), ("mt5", "MT5Config"), ("mvp", "MvpConfig"), @@ -319,6 +320,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( ("mobilenet_v1", "MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("mobilenet_v2", "MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("mobilevit", "MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("mobilevitv2", "MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("mpnet", "MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("mvp", "MVP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("nat", "NAT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -520,6 +522,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("mobilenet_v1", "MobileNetV1"), ("mobilenet_v2", "MobileNetV2"), ("mobilevit", "MobileViT"), + ("mobilevitv2", "MobileViTV2"), ("mpnet", "MPNet"), ("mt5", "MT5"), ("mvp", "MVP"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index d3c6615527..cfa11f149c 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -77,6 +77,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( ("mobilenet_v2", "MobileNetV2ImageProcessor"), ("mobilevit", "MobileViTImageProcessor"), ("mobilevit", "MobileViTImageProcessor"), + ("mobilevitv2", "MobileViTImageProcessor"), ("nat", "ViTImageProcessor"), ("oneformer", "OneFormerImageProcessor"), ("owlvit", "OwlViTImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index d6581f94d8..0a10eb96e9 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -131,6 +131,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("mobilenet_v1", "MobileNetV1Model"), ("mobilenet_v2", "MobileNetV2Model"), ("mobilevit", "MobileViTModel"), + ("mobilevitv2", "MobileViTV2Model"), ("mpnet", "MPNetModel"), ("mt5", "MT5Model"), ("mvp", "MvpModel"), @@ -457,6 +458,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( ("mobilenet_v1", "MobileNetV1ForImageClassification"), ("mobilenet_v2", "MobileNetV2ForImageClassification"), ("mobilevit", "MobileViTForImageClassification"), + ("mobilevitv2", "MobileViTV2ForImageClassification"), ("nat", "NatForImageClassification"), ( "perceiver", @@ -496,6 +498,7 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict( ("dpt", "DPTForSemanticSegmentation"), ("mobilenet_v2", "MobileNetV2ForSemanticSegmentation"), ("mobilevit", "MobileViTForSemanticSegmentation"), + ("mobilevitv2", "MobileViTV2ForSemanticSegmentation"), ("segformer", "SegformerForSemanticSegmentation"), ("upernet", "UperNetForSemanticSegmentation"), ] diff --git a/src/transformers/models/mobilevitv2/__init__.py b/src/transformers/models/mobilevitv2/__init__.py new file mode 100644 index 0000000000..043caf7b75 --- /dev/null +++ b/src/transformers/models/mobilevitv2/__init__.py @@ -0,0 +1,71 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, + is_vision_available, +) + + +_import_structure = { + "configuration_mobilevitv2": [ + "MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP", + "MobileViTV2Config", + "MobileViTV2OnnxConfig", + ], +} + + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_mobilevitv2"] = [ + "MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST", + "MobileViTV2ForImageClassification", + "MobileViTV2ForSemanticSegmentation", + "MobileViTV2Model", + "MobileViTV2PreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_mobilevitv2 import ( + MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP, + MobileViTV2Config, + MobileViTV2OnnxConfig, + ) + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_mobilevitv2 import ( + MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST, + MobileViTV2ForImageClassification, + MobileViTV2ForSemanticSegmentation, + MobileViTV2Model, + MobileViTV2PreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py b/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py new file mode 100644 index 0000000000..d98d88647e --- /dev/null +++ b/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py @@ -0,0 +1,168 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" MobileViTV2 model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "apple/mobilevitv2-1.0": "https://huggingface.co/apple/mobilevitv2-1.0/resolve/main/config.json", +} + + +class MobileViTV2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MobileViTV2Model`]. It is used to instantiate a + MobileViTV2 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the MobileViTV2 + [apple/mobilevitv2-1.0](https://huggingface.co/apple/mobilevitv2-1.0) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + image_size (`int`, *optional*, defaults to 256): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 2): + The size (resolution) of each patch. + expand_ratio (`float`, *optional*, defaults to 2.0): + Expansion factor for the MobileNetv2 layers. + hidden_act (`str` or `function`, *optional*, defaults to `"swish"`): + The non-linear activation function (function or string) in the Transformer encoder and convolution layers. + conv_kernel_size (`int`, *optional*, defaults to 3): + The size of the convolutional kernel in the MobileViTV2 layer. + output_stride (`int`, `optional`, defaults to 32): + The ratio of the spatial resolution of the output to the resolution of the input image. + classifier_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for attached classifiers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + aspp_out_channels (`int`, `optional`, defaults to 512): + Number of output channels used in the ASPP layer for semantic segmentation. + atrous_rates (`List[int]`, *optional*, defaults to `[6, 12, 18]`): + Dilation (atrous) factors used in the ASPP layer for semantic segmentation. + aspp_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the ASPP layer for semantic segmentation. + semantic_loss_ignore_index (`int`, *optional*, defaults to 255): + The index that is ignored by the loss function of the semantic segmentation model. + n_attn_blocks (`List[int]`, *optional*, defaults to `[2, 4, 3]`): + The number of attention blocks in each MobileViTV2Layer + base_attn_unit_dims (`List[int]`, *optional*, defaults to `[128, 192, 256]`): + The base multiplier for dimensions of attention blocks in each MobileViTV2Layer + width_multiplier (`float`, *optional*, defaults to 1.0) + The width multiplier for MobileViTV2. + ffn_multiplier (`int`, *optional*, defaults to 2) + The FFN multiplier for MobileViTV2. + attn_dropout (`float`, *optional*, defaults to 0.0) + The dropout in the attention layer. + ffn_dropout (`float`, *optional*, defaults to 0.0) + The dropout between FFN layers. + + Example: + + ```python + >>> from transformers import MobileViTV2Config, MobileViTV2Model + + >>> # Initializing a mobilevitv2-small style configuration + >>> configuration = MobileViTV2Config() + + >>> # Initializing a model from the mobilevitv2-small style configuration + >>> model = MobileViTV2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "mobilevitv2" + + def __init__( + self, + num_channels=3, + image_size=256, + patch_size=2, + expand_ratio=2.0, + hidden_act="swish", + conv_kernel_size=3, + output_stride=32, + classifier_dropout_prob=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + aspp_out_channels=512, + atrous_rates=[6, 12, 18], + aspp_dropout_prob=0.1, + semantic_loss_ignore_index=255, + n_attn_blocks=[2, 4, 3], + base_attn_unit_dims=[128, 192, 256], + width_multiplier=1.0, + ffn_multiplier=2, + attn_dropout=0.0, + ffn_dropout=0.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size + self.expand_ratio = expand_ratio + self.hidden_act = hidden_act + self.conv_kernel_size = conv_kernel_size + self.output_stride = output_stride + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.n_attn_blocks = n_attn_blocks + self.base_attn_unit_dims = base_attn_unit_dims + self.width_multiplier = width_multiplier + self.ffn_multiplier = ffn_multiplier + self.ffn_dropout = ffn_dropout + self.attn_dropout = attn_dropout + self.classifier_dropout_prob = classifier_dropout_prob + + # decode head attributes for semantic segmentation + self.aspp_out_channels = aspp_out_channels + self.atrous_rates = atrous_rates + self.aspp_dropout_prob = aspp_dropout_prob + self.semantic_loss_ignore_index = semantic_loss_ignore_index + + +class MobileViTV2OnnxConfig(OnnxConfig): + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict([("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"})]) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "image-classification": + return OrderedDict([("logits", {0: "batch"})]) + else: + return OrderedDict([("last_hidden_state", {0: "batch"}), ("pooler_output", {0: "batch"})]) + + @property + def atol_for_validation(self) -> float: + return 1e-4 diff --git a/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py b/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py new file mode 100644 index 0000000000..16e9029d72 --- /dev/null +++ b/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py @@ -0,0 +1,326 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert MobileViTV2 checkpoints from the ml-cvnets library.""" + + +import argparse +import collections +import json +from pathlib import Path + +import requests +import torch +import yaml +from huggingface_hub import hf_hub_download +from PIL import Image + +from transformers import ( + MobileViTImageProcessor, + MobileViTV2Config, + MobileViTV2ForImageClassification, + MobileViTV2ForSemanticSegmentation, +) +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def load_orig_config_file(orig_cfg_file): + print("Loading config file...") + + def flatten_yaml_as_dict(d, parent_key="", sep="."): + items = [] + for k, v in d.items(): + new_key = parent_key + sep + k if parent_key else k + if isinstance(v, collections.abc.MutableMapping): + items.extend(flatten_yaml_as_dict(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return dict(items) + + config = argparse.Namespace() + with open(orig_cfg_file, "r") as yaml_file: + try: + cfg = yaml.load(yaml_file, Loader=yaml.FullLoader) + + flat_cfg = flatten_yaml_as_dict(cfg) + for k, v in flat_cfg.items(): + setattr(config, k, v) + except yaml.YAMLError as exc: + logger.error("Error while loading config file: {}. Error message: {}".format(orig_cfg_file, str(exc))) + return config + + +def get_mobilevitv2_config(task_name, orig_cfg_file): + config = MobileViTV2Config() + + is_segmentation_model = False + + # dataset + if task_name.startswith("imagenet1k_"): + config.num_labels = 1000 + if int(task_name.strip().split("_")[-1]) == 384: + config.image_size = 384 + else: + config.image_size = 256 + filename = "imagenet-1k-id2label.json" + elif task_name.startswith("imagenet21k_to_1k_"): + config.num_labels = 21000 + if int(task_name.strip().split("_")[-1]) == 384: + config.image_size = 384 + else: + config.image_size = 256 + filename = "imagenet-22k-id2label.json" + elif task_name.startswith("ade20k_"): + config.num_labels = 151 + config.image_size = 512 + filename = "ade20k-id2label.json" + is_segmentation_model = True + elif task_name.startswith("voc_"): + config.num_labels = 21 + config.image_size = 512 + filename = "pascal-voc-id2label.json" + is_segmentation_model = True + + # orig_config + orig_config = load_orig_config_file(orig_cfg_file) + assert getattr(orig_config, "model.classification.name", -1) == "mobilevit_v2", "Invalid model" + config.width_multiplier = getattr(orig_config, "model.classification.mitv2.width_multiplier", 1.0) + assert ( + getattr(orig_config, "model.classification.mitv2.attn_norm_layer", -1) == "layer_norm_2d" + ), "Norm layers other than layer_norm_2d is not supported" + config.hidden_act = getattr(orig_config, "model.classification.activation.name", "swish") + # config.image_size == getattr(orig_config, 'sampler.bs.crop_size_width', 256) + + if is_segmentation_model: + config.output_stride = getattr(orig_config, "model.segmentation.output_stride", 16) + if "_deeplabv3" in task_name: + config.atrous_rates = getattr(orig_config, "model.segmentation.deeplabv3.aspp_rates", [12, 24, 36]) + config.aspp_out_channels = getattr(orig_config, "model.segmentation.deeplabv3.aspp_out_channels", 512) + config.aspp_dropout_prob = getattr(orig_config, "model.segmentation.deeplabv3.aspp_dropout", 0.1) + + # id2label + repo_id = "huggingface/label-files" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + return config + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +def create_rename_keys(state_dict, base_model=False): + if base_model: + model_prefix = "" + else: + model_prefix = "mobilevitv2." + + rename_keys = [] + for k in state_dict.keys(): + if k[:8] == "encoder.": + k_new = k[8:] + else: + k_new = k + + if ".block." in k: + k_new = k_new.replace(".block.", ".") + if ".conv." in k: + k_new = k_new.replace(".conv.", ".convolution.") + if ".norm." in k: + k_new = k_new.replace(".norm.", ".normalization.") + + if "conv_1." in k: + k_new = k_new.replace("conv_1.", f"{model_prefix}conv_stem.") + for i in [1, 2]: + if f"layer_{i}." in k: + k_new = k_new.replace(f"layer_{i}.", f"{model_prefix}encoder.layer.{i-1}.layer.") + if ".exp_1x1." in k: + k_new = k_new.replace(".exp_1x1.", ".expand_1x1.") + if ".red_1x1." in k: + k_new = k_new.replace(".red_1x1.", ".reduce_1x1.") + + for i in [3, 4, 5]: + if f"layer_{i}.0." in k: + k_new = k_new.replace(f"layer_{i}.0.", f"{model_prefix}encoder.layer.{i-1}.downsampling_layer.") + if f"layer_{i}.1.local_rep.0." in k: + k_new = k_new.replace(f"layer_{i}.1.local_rep.0.", f"{model_prefix}encoder.layer.{i-1}.conv_kxk.") + if f"layer_{i}.1.local_rep.1." in k: + k_new = k_new.replace(f"layer_{i}.1.local_rep.1.", f"{model_prefix}encoder.layer.{i-1}.conv_1x1.") + + for i in [3, 4, 5]: + if i == 3: + j_in = [0, 1] + elif i == 4: + j_in = [0, 1, 2, 3] + elif i == 5: + j_in = [0, 1, 2] + + for j in j_in: + if f"layer_{i}.1.global_rep.{j}." in k: + k_new = k_new.replace( + f"layer_{i}.1.global_rep.{j}.", f"{model_prefix}encoder.layer.{i-1}.transformer.layer.{j}." + ) + if f"layer_{i}.1.global_rep.{j+1}." in k: + k_new = k_new.replace( + f"layer_{i}.1.global_rep.{j+1}.", f"{model_prefix}encoder.layer.{i-1}.layernorm." + ) + + if f"layer_{i}.1.conv_proj." in k: + k_new = k_new.replace(f"layer_{i}.1.conv_proj.", f"{model_prefix}encoder.layer.{i-1}.conv_projection.") + + if "pre_norm_attn.0." in k: + k_new = k_new.replace("pre_norm_attn.0.", "layernorm_before.") + if "pre_norm_attn.1." in k: + k_new = k_new.replace("pre_norm_attn.1.", "attention.") + if "pre_norm_ffn.0." in k: + k_new = k_new.replace("pre_norm_ffn.0.", "layernorm_after.") + if "pre_norm_ffn.1." in k: + k_new = k_new.replace("pre_norm_ffn.1.", "ffn.conv1.") + if "pre_norm_ffn.3." in k: + k_new = k_new.replace("pre_norm_ffn.3.", "ffn.conv2.") + + if "classifier.1." in k: + k_new = k_new.replace("classifier.1.", "classifier.") + + if "seg_head." in k: + k_new = k_new.replace("seg_head.", "segmentation_head.") + if ".aspp_layer." in k: + k_new = k_new.replace(".aspp_layer.", ".") + if ".aspp_pool." in k: + k_new = k_new.replace(".aspp_pool.", ".") + + rename_keys.append((k, k_new)) + return rename_keys + + +def remove_unused_keys(state_dict): + """remove unused keys (e.g.: seg_head.aux_head)""" + keys_to_ignore = [] + for k in state_dict.keys(): + if k.startswith("seg_head.aux_head."): + keys_to_ignore.append(k) + for k in keys_to_ignore: + state_dict.pop(k, None) + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + # url = "https://cdn.britannica.com/86/141086-050-9D7C75EE/Gulfstream-G450-business-jet-passengers.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@torch.no_grad() +def convert_mobilevitv2_checkpoint(task_name, checkpoint_path, orig_config_path, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our MobileViTV2 structure. + """ + config = get_mobilevitv2_config(task_name, orig_config_path) + + # load original state_dict + checkpoint = torch.load(checkpoint_path, map_location="cpu") + + # load huggingface model + if task_name.startswith("ade20k_") or task_name.startswith("voc_"): + model = MobileViTV2ForSemanticSegmentation(config).eval() + base_model = False + else: + model = MobileViTV2ForImageClassification(config).eval() + base_model = False + + # remove and rename some keys of load the original model + state_dict = checkpoint + remove_unused_keys(state_dict) + rename_keys = create_rename_keys(state_dict, base_model=base_model) + for rename_key_src, rename_key_dest in rename_keys: + rename_key(state_dict, rename_key_src, rename_key_dest) + + # load modified state_dict + model.load_state_dict(state_dict) + + # Check outputs on an image, prepared by MobileViTImageProcessor + feature_extractor = MobileViTImageProcessor(crop_size=config.image_size, size=config.image_size + 32) + encoding = feature_extractor(images=prepare_img(), return_tensors="pt") + outputs = model(**encoding) + + # verify classification model + if task_name.startswith("imagenet"): + logits = outputs.logits + predicted_class_idx = logits.argmax(-1).item() + print("Predicted class:", model.config.id2label[predicted_class_idx]) + if task_name.startswith("imagenet1k_256") and config.width_multiplier == 1.0: + # expected_logits for base variant + expected_logits = torch.tensor([-1.6336e00, -7.3204e-02, -5.1883e-01]) + assert torch.allclose(logits[0, :3], expected_logits, atol=1e-4) + + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {task_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--task", + default="imagenet1k_256", + type=str, + help=( + "Name of the task for which the MobileViTV2 model you'd like to convert is trained on . " + """ + Classification (ImageNet-1k) + - MobileViTV2 (256x256) : imagenet1k_256 + - MobileViTV2 (Trained on 256x256 and Finetuned on 384x384) : imagenet1k_384 + - MobileViTV2 (Trained on ImageNet-21k and Finetuned on ImageNet-1k 256x256) : + imagenet21k_to_1k_256 + - MobileViTV2 (Trained on ImageNet-21k, Finetuned on ImageNet-1k 256x256, and Finetuned on + ImageNet-1k 384x384) : imagenet21k_to_1k_384 + Segmentation + - ADE20K Dataset : ade20k_deeplabv3 + - Pascal VOC 2012 Dataset: voc_deeplabv3 + """ + ), + choices=[ + "imagenet1k_256", + "imagenet1k_384", + "imagenet21k_to_1k_256", + "imagenet21k_to_1k_384", + "ade20k_deeplabv3", + "voc_deeplabv3", + ], + ) + + parser.add_argument( + "--orig_checkpoint_path", required=True, type=str, help="Path to the original state dict (.pt file)." + ) + parser.add_argument("--orig_config_path", required=True, type=str, help="Path to the original config file.") + parser.add_argument( + "--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model directory." + ) + + args = parser.parse_args() + convert_mobilevitv2_checkpoint( + args.task, args.orig_checkpoint_path, args.orig_config_path, args.pytorch_dump_folder_path + ) diff --git a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py new file mode 100644 index 0000000000..b8c071a74f --- /dev/null +++ b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py @@ -0,0 +1,1044 @@ +# coding=utf-8 +# Copyright 2023 Apple Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Original license: https://github.com/apple/ml-cvnets/blob/main/LICENSE +""" PyTorch MobileViTV2 model.""" + + +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutputWithNoAttention, + BaseModelOutputWithPoolingAndNoAttention, + ImageClassifierOutputWithNoAttention, + SemanticSegmenterOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_mobilevitv2 import MobileViTV2Config + + +logger = logging.get_logger(__name__) + + +# General docstring +_CONFIG_FOR_DOC = "MobileViTV2Config" + +# Base docstring +_CHECKPOINT_FOR_DOC = "apple/mobilevitv2-1.0-imagenet1k-256" +_EXPECTED_OUTPUT_SHAPE = [1, 512, 8, 8] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "apple/mobilevitv2-1.0-imagenet1k-256" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + + +MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "apple/mobilevitv2-1.0-imagenet1k-256" + # See all MobileViTV2 models at https://huggingface.co/models?filter=mobilevitv2 +] + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.make_divisible +def make_divisible(value: int, divisor: int = 8, min_value: Optional[int] = None) -> int: + """ + Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the + original TensorFlow repo. It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + """ + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_value < 0.9 * value: + new_value += divisor + return int(new_value) + + +def clip(value: float, min_val: float = float("-inf"), max_val: float = float("inf")) -> float: + return max(min_val, min(max_val, value)) + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTConvLayer with MobileViT->MobileViTV2 +class MobileViTV2ConvLayer(nn.Module): + def __init__( + self, + config: MobileViTV2Config, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + groups: int = 1, + bias: bool = False, + dilation: int = 1, + use_normalization: bool = True, + use_activation: Union[bool, str] = True, + ) -> None: + super().__init__() + padding = int((kernel_size - 1) / 2) * dilation + + if in_channels % groups != 0: + raise ValueError(f"Input channels ({in_channels}) are not divisible by {groups} groups.") + if out_channels % groups != 0: + raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.") + + self.convolution = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + padding_mode="zeros", + ) + + if use_normalization: + self.normalization = nn.BatchNorm2d( + num_features=out_channels, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True, + ) + else: + self.normalization = None + + if use_activation: + if isinstance(use_activation, str): + self.activation = ACT2FN[use_activation] + elif isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + else: + self.activation = None + + def forward(self, features: torch.Tensor) -> torch.Tensor: + features = self.convolution(features) + if self.normalization is not None: + features = self.normalization(features) + if self.activation is not None: + features = self.activation(features) + return features + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTInvertedResidual with MobileViT->MobileViTV2 +class MobileViTV2InvertedResidual(nn.Module): + """ + Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381 + """ + + def __init__( + self, config: MobileViTV2Config, in_channels: int, out_channels: int, stride: int, dilation: int = 1 + ) -> None: + super().__init__() + expanded_channels = make_divisible(int(round(in_channels * config.expand_ratio)), 8) + + if stride not in [1, 2]: + raise ValueError(f"Invalid stride {stride}.") + + self.use_residual = (stride == 1) and (in_channels == out_channels) + + self.expand_1x1 = MobileViTV2ConvLayer( + config, in_channels=in_channels, out_channels=expanded_channels, kernel_size=1 + ) + + self.conv_3x3 = MobileViTV2ConvLayer( + config, + in_channels=expanded_channels, + out_channels=expanded_channels, + kernel_size=3, + stride=stride, + groups=expanded_channels, + dilation=dilation, + ) + + self.reduce_1x1 = MobileViTV2ConvLayer( + config, + in_channels=expanded_channels, + out_channels=out_channels, + kernel_size=1, + use_activation=False, + ) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + residual = features + + features = self.expand_1x1(features) + features = self.conv_3x3(features) + features = self.reduce_1x1(features) + + return residual + features if self.use_residual else features + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTMobileNetLayer with MobileViT->MobileViTV2 +class MobileViTV2MobileNetLayer(nn.Module): + def __init__( + self, config: MobileViTV2Config, in_channels: int, out_channels: int, stride: int = 1, num_stages: int = 1 + ) -> None: + super().__init__() + + self.layer = nn.ModuleList() + for i in range(num_stages): + layer = MobileViTV2InvertedResidual( + config, + in_channels=in_channels, + out_channels=out_channels, + stride=stride if i == 0 else 1, + ) + self.layer.append(layer) + in_channels = out_channels + + def forward(self, features: torch.Tensor) -> torch.Tensor: + for layer_module in self.layer: + features = layer_module(features) + return features + + +class MobileViTV2LinearSelfAttention(nn.Module): + """ + This layer applies a self-attention with linear complexity, as described in MobileViTV2 paper: + https://arxiv.org/abs/2206.02680 + + Args: + config (`MobileVitv2Config`): + Model configuration object + embed_dim (`int`): + `input_channels` from an expected input of size :math:`(batch_size, input_channels, height, width)` + """ + + def __init__(self, config: MobileViTV2Config, embed_dim: int) -> None: + super().__init__() + + self.qkv_proj = MobileViTV2ConvLayer( + config=config, + in_channels=embed_dim, + out_channels=1 + (2 * embed_dim), + bias=True, + kernel_size=1, + use_normalization=False, + use_activation=False, + ) + + self.attn_dropout = nn.Dropout(p=config.attn_dropout) + self.out_proj = MobileViTV2ConvLayer( + config=config, + in_channels=embed_dim, + out_channels=embed_dim, + bias=True, + kernel_size=1, + use_normalization=False, + use_activation=False, + ) + self.embed_dim = embed_dim + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # (batch_size, embed_dim, num_pixels_in_patch, num_patches) --> (batch_size, 1+2*embed_dim, num_pixels_in_patch, num_patches) + qkv = self.qkv_proj(hidden_states) + + # Project hidden_states into query, key and value + # Query --> [batch_size, 1, num_pixels_in_patch, num_patches] + # value, key --> [batch_size, embed_dim, num_pixels_in_patch, num_patches] + query, key, value = torch.split(qkv, split_size_or_sections=[1, self.embed_dim, self.embed_dim], dim=1) + + # apply softmax along num_patches dimension + context_scores = torch.nn.functional.softmax(query, dim=-1) + context_scores = self.attn_dropout(context_scores) + + # Compute context vector + # [batch_size, embed_dim, num_pixels_in_patch, num_patches] x [batch_size, 1, num_pixels_in_patch, num_patches] -> [batch_size, embed_dim, num_pixels_in_patch, num_patches] + context_vector = key * context_scores + # [batch_size, embed_dim, num_pixels_in_patch, num_patches] --> [batch_size, embed_dim, num_pixels_in_patch, 1] + context_vector = torch.sum(context_vector, dim=-1, keepdim=True) + + # combine context vector with values + # [batch_size, embed_dim, num_pixels_in_patch, num_patches] * [batch_size, embed_dim, num_pixels_in_patch, 1] --> [batch_size, embed_dim, num_pixels_in_patch, num_patches] + out = torch.nn.functional.relu(value) * context_vector.expand_as(value) + out = self.out_proj(out) + return out + + +class MobileViTV2FFN(nn.Module): + def __init__( + self, + config: MobileViTV2Config, + embed_dim: int, + ffn_latent_dim: int, + ffn_dropout: float = 0.0, + ) -> None: + super().__init__() + self.conv1 = MobileViTV2ConvLayer( + config=config, + in_channels=embed_dim, + out_channels=ffn_latent_dim, + kernel_size=1, + stride=1, + bias=True, + use_normalization=False, + use_activation=True, + ) + self.dropout1 = nn.Dropout(ffn_dropout) + + self.conv2 = MobileViTV2ConvLayer( + config=config, + in_channels=ffn_latent_dim, + out_channels=embed_dim, + kernel_size=1, + stride=1, + bias=True, + use_normalization=False, + use_activation=False, + ) + self.dropout2 = nn.Dropout(ffn_dropout) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.conv1(hidden_states) + hidden_states = self.dropout1(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = self.dropout2(hidden_states) + return hidden_states + + +class MobileViTV2TransformerLayer(nn.Module): + def __init__( + self, + config: MobileViTV2Config, + embed_dim: int, + ffn_latent_dim: int, + dropout: float = 0.0, + ) -> None: + super().__init__() + self.layernorm_before = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=config.layer_norm_eps) + self.attention = MobileViTV2LinearSelfAttention(config, embed_dim) + self.dropout1 = nn.Dropout(p=dropout) + self.layernorm_after = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=config.layer_norm_eps) + self.ffn = MobileViTV2FFN(config, embed_dim, ffn_latent_dim, config.ffn_dropout) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + layernorm_1_out = self.layernorm_before(hidden_states) + attention_output = self.attention(layernorm_1_out) + hidden_states = attention_output + hidden_states + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.ffn(layer_output) + + layer_output = layer_output + hidden_states + return layer_output + + +class MobileViTV2Transformer(nn.Module): + def __init__(self, config: MobileViTV2Config, n_layers: int, d_model: int) -> None: + super().__init__() + + ffn_multiplier = config.ffn_multiplier + + ffn_dims = [ffn_multiplier * d_model] * n_layers + + # ensure that dims are multiple of 16 + ffn_dims = [int((d // 16) * 16) for d in ffn_dims] + + self.layer = nn.ModuleList() + for block_idx in range(n_layers): + transformer_layer = MobileViTV2TransformerLayer( + config, embed_dim=d_model, ffn_latent_dim=ffn_dims[block_idx] + ) + self.layer.append(transformer_layer) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for layer_module in self.layer: + hidden_states = layer_module(hidden_states) + return hidden_states + + +class MobileViTV2Layer(nn.Module): + """ + MobileViTV2 layer: https://arxiv.org/abs/2206.02680 + """ + + def __init__( + self, + config: MobileViTV2Config, + in_channels: int, + out_channels: int, + attn_unit_dim: int, + n_attn_blocks: int = 2, + dilation: int = 1, + stride: int = 2, + ) -> None: + super().__init__() + self.patch_width = config.patch_size + self.patch_height = config.patch_size + + cnn_out_dim = attn_unit_dim + + if stride == 2: + self.downsampling_layer = MobileViTV2InvertedResidual( + config, + in_channels=in_channels, + out_channels=out_channels, + stride=stride if dilation == 1 else 1, + dilation=dilation // 2 if dilation > 1 else 1, + ) + in_channels = out_channels + else: + self.downsampling_layer = None + + # Local representations + self.conv_kxk = MobileViTV2ConvLayer( + config, + in_channels=in_channels, + out_channels=in_channels, + kernel_size=config.conv_kernel_size, + groups=in_channels, + ) + self.conv_1x1 = MobileViTV2ConvLayer( + config, + in_channels=in_channels, + out_channels=cnn_out_dim, + kernel_size=1, + use_normalization=False, + use_activation=False, + ) + + # Global representations + self.transformer = MobileViTV2Transformer(config, d_model=attn_unit_dim, n_layers=n_attn_blocks) + + # self.layernorm = MobileViTV2LayerNorm2D(attn_unit_dim, eps=config.layer_norm_eps) + self.layernorm = nn.GroupNorm(num_groups=1, num_channels=attn_unit_dim, eps=config.layer_norm_eps) + + # Fusion + self.conv_projection = MobileViTV2ConvLayer( + config, + in_channels=cnn_out_dim, + out_channels=in_channels, + kernel_size=1, + use_normalization=True, + use_activation=False, + ) + + def unfolding(self, feature_map: torch.Tensor) -> Tuple[torch.Tensor, Tuple[int, int]]: + batch_size, in_channels, img_height, img_width = feature_map.shape + patches = nn.functional.unfold( + feature_map, + kernel_size=(self.patch_height, self.patch_width), + stride=(self.patch_height, self.patch_width), + ) + patches = patches.reshape(batch_size, in_channels, self.patch_height * self.patch_width, -1) + + return patches, (img_height, img_width) + + def folding(self, patches: torch.Tensor, output_size: Tuple[int, int]) -> torch.Tensor: + batch_size, in_dim, patch_size, n_patches = patches.shape + patches = patches.reshape(batch_size, in_dim * patch_size, n_patches) + + feature_map = nn.functional.fold( + patches, + output_size=output_size, + kernel_size=(self.patch_height, self.patch_width), + stride=(self.patch_height, self.patch_width), + ) + + return feature_map + + def forward(self, features: torch.Tensor) -> torch.Tensor: + # reduce spatial dimensions if needed + if self.downsampling_layer: + features = self.downsampling_layer(features) + + # local representation + features = self.conv_kxk(features) + features = self.conv_1x1(features) + + # convert feature map to patches + patches, output_size = self.unfolding(features) + + # learn global representations + patches = self.transformer(patches) + patches = self.layernorm(patches) + + # convert patches back to feature maps + # [batch_size, patch_height, patch_width, input_dim] --> [batch_size, input_dim, patch_height, patch_width] + features = self.folding(patches, output_size) + + features = self.conv_projection(features) + return features + + +class MobileViTV2Encoder(nn.Module): + def __init__(self, config: MobileViTV2Config) -> None: + super().__init__() + self.config = config + + self.layer = nn.ModuleList() + self.gradient_checkpointing = False + + # segmentation architectures like DeepLab and PSPNet modify the strides + # of the classification backbones + dilate_layer_4 = dilate_layer_5 = False + if config.output_stride == 8: + dilate_layer_4 = True + dilate_layer_5 = True + elif config.output_stride == 16: + dilate_layer_5 = True + + dilation = 1 + + layer_0_dim = make_divisible( + clip(value=32 * config.width_multiplier, min_val=16, max_val=64), divisor=8, min_value=16 + ) + + layer_1_dim = make_divisible(64 * config.width_multiplier, divisor=16) + layer_2_dim = make_divisible(128 * config.width_multiplier, divisor=8) + layer_3_dim = make_divisible(256 * config.width_multiplier, divisor=8) + layer_4_dim = make_divisible(384 * config.width_multiplier, divisor=8) + layer_5_dim = make_divisible(512 * config.width_multiplier, divisor=8) + + layer_1 = MobileViTV2MobileNetLayer( + config, + in_channels=layer_0_dim, + out_channels=layer_1_dim, + stride=1, + num_stages=1, + ) + self.layer.append(layer_1) + + layer_2 = MobileViTV2MobileNetLayer( + config, + in_channels=layer_1_dim, + out_channels=layer_2_dim, + stride=2, + num_stages=2, + ) + self.layer.append(layer_2) + + layer_3 = MobileViTV2Layer( + config, + in_channels=layer_2_dim, + out_channels=layer_3_dim, + attn_unit_dim=make_divisible(config.base_attn_unit_dims[0] * config.width_multiplier, divisor=8), + n_attn_blocks=config.n_attn_blocks[0], + ) + self.layer.append(layer_3) + + if dilate_layer_4: + dilation *= 2 + + layer_4 = MobileViTV2Layer( + config, + in_channels=layer_3_dim, + out_channels=layer_4_dim, + attn_unit_dim=make_divisible(config.base_attn_unit_dims[1] * config.width_multiplier, divisor=8), + n_attn_blocks=config.n_attn_blocks[1], + dilation=dilation, + ) + self.layer.append(layer_4) + + if dilate_layer_5: + dilation *= 2 + + layer_5 = MobileViTV2Layer( + config, + in_channels=layer_4_dim, + out_channels=layer_5_dim, + attn_unit_dim=make_divisible(config.base_attn_unit_dims[2] * config.width_multiplier, divisor=8), + n_attn_blocks=config.n_attn_blocks[2], + dilation=dilation, + ) + self.layer.append(layer_5) + + def forward( + self, + hidden_states: torch.Tensor, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutputWithNoAttention]: + all_hidden_states = () if output_hidden_states else None + + for i, layer_module in enumerate(self.layer): + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + ) + else: + hidden_states = layer_module(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) + + return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states) + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTPreTrainedModel with MobileViT->MobileViTV2,mobilevit->mobilevitv2 +class MobileViTV2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = MobileViTV2Config + base_model_prefix = "mobilevitv2" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, MobileViTV2Encoder): + module.gradient_checkpointing = value + + +MOBILEVITV2_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`MobileViTV2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +MOBILEVITV2_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`MobileViTImageProcessor.__call__`] for details. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare MobileViTV2 model outputting raw hidden-states without any specific head on top.", + MOBILEVITV2_START_DOCSTRING, +) +class MobileViTV2Model(MobileViTV2PreTrainedModel): + def __init__(self, config: MobileViTV2Config, expand_output: bool = True): + super().__init__(config) + self.config = config + self.expand_output = expand_output + + layer_0_dim = make_divisible( + clip(value=32 * config.width_multiplier, min_val=16, max_val=64), divisor=8, min_value=16 + ) + + self.conv_stem = MobileViTV2ConvLayer( + config, + in_channels=config.num_channels, + out_channels=layer_0_dim, + kernel_size=3, + stride=2, + use_normalization=True, + use_activation=True, + ) + self.encoder = MobileViTV2Encoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def _prune_heads(self, heads_to_prune): + """Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel + """ + for layer_index, heads in heads_to_prune.items(): + mobilevitv2_layer = self.encoder.layer[layer_index] + if isinstance(mobilevitv2_layer, MobileViTV2Layer): + for transformer_layer in mobilevitv2_layer.transformer.layer: + transformer_layer.attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(MOBILEVITV2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndNoAttention, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.conv_stem(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.expand_output: + last_hidden_state = encoder_outputs[0] + + # global average pooling: (batch_size, channels, height, width) -> (batch_size, channels) + pooled_output = torch.mean(last_hidden_state, dim=[-2, -1], keepdim=False) + else: + last_hidden_state = encoder_outputs[0] + pooled_output = None + + if not return_dict: + output = (last_hidden_state, pooled_output) if pooled_output is not None else (last_hidden_state,) + return output + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +@add_start_docstrings( + """ + MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """, + MOBILEVITV2_START_DOCSTRING, +) +class MobileViTV2ForImageClassification(MobileViTV2PreTrainedModel): + def __init__(self, config: MobileViTV2Config) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.mobilevitv2 = MobileViTV2Model(config) + + out_channels = make_divisible(512 * config.width_multiplier, divisor=8) # layer 5 output dimension + # Classifier head + self.classifier = ( + nn.Linear(in_features=out_channels, out_features=config.num_labels) + if config.num_labels > 0 + else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(MOBILEVITV2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutputWithNoAttention, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutputWithNoAttention]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mobilevitv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutputWithNoAttention( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + ) + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTASPPPooling with MobileViT->MobileViTV2 +class MobileViTV2ASPPPooling(nn.Module): + def __init__(self, config: MobileViTV2Config, in_channels: int, out_channels: int) -> None: + super().__init__() + + self.global_pool = nn.AdaptiveAvgPool2d(output_size=1) + + self.conv_1x1 = MobileViTV2ConvLayer( + config, + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_normalization=True, + use_activation="relu", + ) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + spatial_size = features.shape[-2:] + features = self.global_pool(features) + features = self.conv_1x1(features) + features = nn.functional.interpolate(features, size=spatial_size, mode="bilinear", align_corners=False) + return features + + +class MobileViTV2ASPP(nn.Module): + """ + ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587 + """ + + def __init__(self, config: MobileViTV2Config) -> None: + super().__init__() + + encoder_out_channels = make_divisible(512 * config.width_multiplier, divisor=8) # layer 5 output dimension + in_channels = encoder_out_channels + out_channels = config.aspp_out_channels + + if len(config.atrous_rates) != 3: + raise ValueError("Expected 3 values for atrous_rates") + + self.convs = nn.ModuleList() + + in_projection = MobileViTV2ConvLayer( + config, + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + use_activation="relu", + ) + self.convs.append(in_projection) + + self.convs.extend( + [ + MobileViTV2ConvLayer( + config, + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + dilation=rate, + use_activation="relu", + ) + for rate in config.atrous_rates + ] + ) + + pool_layer = MobileViTV2ASPPPooling(config, in_channels, out_channels) + self.convs.append(pool_layer) + + self.project = MobileViTV2ConvLayer( + config, in_channels=5 * out_channels, out_channels=out_channels, kernel_size=1, use_activation="relu" + ) + + self.dropout = nn.Dropout(p=config.aspp_dropout_prob) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + pyramid = [] + for conv in self.convs: + pyramid.append(conv(features)) + pyramid = torch.cat(pyramid, dim=1) + + pooled_features = self.project(pyramid) + pooled_features = self.dropout(pooled_features) + return pooled_features + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTDeepLabV3 with MobileViT->MobileViTV2 +class MobileViTV2DeepLabV3(nn.Module): + """ + DeepLabv3 architecture: https://arxiv.org/abs/1706.05587 + """ + + def __init__(self, config: MobileViTV2Config) -> None: + super().__init__() + self.aspp = MobileViTV2ASPP(config) + + self.dropout = nn.Dropout2d(config.classifier_dropout_prob) + + self.classifier = MobileViTV2ConvLayer( + config, + in_channels=config.aspp_out_channels, + out_channels=config.num_labels, + kernel_size=1, + use_normalization=False, + use_activation=False, + bias=True, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + features = self.aspp(hidden_states[-1]) + features = self.dropout(features) + features = self.classifier(features) + return features + + +@add_start_docstrings( + """ + MobileViTV2 model with a semantic segmentation head on top, e.g. for Pascal VOC. + """, + MOBILEVITV2_START_DOCSTRING, +) +# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTForSemanticSegmentation with MOBILEVIT->MOBILEVITV2,MobileViT->MobileViTV2,mobilevit->mobilevitv2 +class MobileViTV2ForSemanticSegmentation(MobileViTV2PreTrainedModel): + def __init__(self, config: MobileViTV2Config) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.mobilevitv2 = MobileViTV2Model(config, expand_output=False) + self.segmentation_head = MobileViTV2DeepLabV3(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(MOBILEVITV2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SemanticSegmenterOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, MobileViTV2ForSemanticSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevitv2-small") + >>> model = MobileViTV2ForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevitv2-small") + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> # logits are of shape (batch_size, num_labels, height, width) + >>> logits = outputs.logits + ```""" + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mobilevitv2( + pixel_values, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + + encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1] + + logits = self.segmentation_head(encoder_hidden_states) + + loss = None + if labels is not None: + if self.config.num_labels == 1: + raise ValueError("The number of labels should be greater than one") + else: + # upsample logits to the images' original size + upsampled_logits = nn.functional.interpolate( + logits, size=labels.shape[-2:], mode="bilinear", align_corners=False + ) + loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index) + loss = loss_fct(upsampled_logits, labels) + + if not return_dict: + if output_hidden_states: + output = (logits,) + outputs[1:] + else: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SemanticSegmenterOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=None, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index c32528c4c7..069edb3ba8 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4743,6 +4743,37 @@ class MobileViTPreTrainedModel(metaclass=DummyObject): requires_backends(self, ["torch"]) +MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class MobileViTV2ForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MobileViTV2ForSemanticSegmentation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MobileViTV2Model(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MobileViTV2PreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + MPNET_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/mobilevitv2/__init__.py b/tests/models/mobilevitv2/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/mobilevitv2/test_modeling_mobilevitv2.py b/tests/models/mobilevitv2/test_modeling_mobilevitv2.py new file mode 100644 index 0000000000..0da4afd375 --- /dev/null +++ b/tests/models/mobilevitv2/test_modeling_mobilevitv2.py @@ -0,0 +1,383 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch MobileViTV2 model. """ + + +import inspect +import unittest + +from transformers import MobileViTV2Config +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import MobileViTV2ForImageClassification, MobileViTV2ForSemanticSegmentation, MobileViTV2Model + from transformers.models.mobilevitv2.modeling_mobilevitv2 import ( + MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST, + make_divisible, + ) + + +if is_vision_available(): + from PIL import Image + + from transformers import MobileViTImageProcessor + + +class MobileViTV2ConfigTester(ConfigTester): + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, "width_multiplier")) + + +class MobileViTV2ModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=64, + patch_size=2, + num_channels=3, + hidden_act="swish", + conv_kernel_size=3, + output_stride=32, + classifier_dropout_prob=0.1, + initializer_range=0.02, + is_training=True, + use_labels=True, + num_labels=10, + scope=None, + width_multiplier=0.25, + ffn_dropout=0.0, + attn_dropout=0.0, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.last_hidden_size = make_divisible(512 * width_multiplier, divisor=8) + self.hidden_act = hidden_act + self.conv_kernel_size = conv_kernel_size + self.output_stride = output_stride + self.classifier_dropout_prob = classifier_dropout_prob + self.use_labels = use_labels + self.is_training = is_training + self.num_labels = num_labels + self.initializer_range = initializer_range + self.scope = scope + self.width_multiplier = width_multiplier + self.ffn_dropout_prob = ffn_dropout + self.attn_dropout_prob = attn_dropout + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + pixel_labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels, pixel_labels + + def get_config(self): + return MobileViTV2Config( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_act=self.hidden_act, + conv_kernel_size=self.conv_kernel_size, + output_stride=self.output_stride, + classifier_dropout_prob=self.classifier_dropout_prob, + initializer_range=self.initializer_range, + width_multiplier=self.width_multiplier, + ffn_dropout=self.ffn_dropout_prob, + attn_dropout=self.attn_dropout_prob, + ) + + def create_and_check_model(self, config, pixel_values, labels, pixel_labels): + model = MobileViTV2Model(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + result.last_hidden_state.shape, + ( + self.batch_size, + self.last_hidden_size, + self.image_size // self.output_stride, + self.image_size // self.output_stride, + ), + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels): + config.num_labels = self.num_labels + model = MobileViTV2ForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels, pixel_labels): + config.num_labels = self.num_labels + model = MobileViTV2ForSemanticSegmentation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + result.logits.shape, + ( + self.batch_size, + self.num_labels, + self.image_size // self.output_stride, + self.image_size // self.output_stride, + ), + ) + result = model(pixel_values, labels=pixel_labels) + self.parent.assertEqual( + result.logits.shape, + ( + self.batch_size, + self.num_labels, + self.image_size // self.output_stride, + self.image_size // self.output_stride, + ), + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels, pixel_labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class MobileViTV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as MobileViTV2 does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = ( + (MobileViTV2Model, MobileViTV2ForImageClassification, MobileViTV2ForSemanticSegmentation) + if is_torch_available() + else () + ) + + pipeline_model_mapping = ( + { + "feature-extraction": MobileViTV2Model, + "image-classification": MobileViTV2ForImageClassification, + "image-segmentation": MobileViTV2ForSemanticSegmentation, + } + if is_torch_available() + else {} + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + + def setUp(self): + self.model_tester = MobileViTV2ModelTester(self) + self.config_tester = MobileViTV2ConfigTester(self, config_class=MobileViTV2Config, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="MobileViTV2 does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="MobileViTV2 does not support input and output embeddings") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="MobileViTV2 does not output attentions") + def test_attention_outputs(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_stages = 5 + self.assertEqual(len(hidden_states), expected_num_stages) + + # MobileViTV2's feature maps are of shape (batch_size, num_channels, height, width) + # with the width and height being successively divided by 2. + divisor = 2 + for i in range(len(hidden_states)): + self.assertListEqual( + list(hidden_states[i].shape[-2:]), + [self.model_tester.image_size // divisor, self.model_tester.image_size // divisor], + ) + divisor *= 2 + + self.assertEqual(self.model_tester.output_stride, divisor // 2) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + def test_for_semantic_segmentation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = MobileViTV2Model.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class MobileViTV2ModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ( + MobileViTImageProcessor.from_pretrained("shehan97/mobilevitv2-1.0-imagenet1k-256") + if is_vision_available() + else None + ) + + @slow + def test_inference_image_classification_head(self): + model = MobileViTV2ForImageClassification.from_pretrained("shehan97/mobilevitv2-1.0-imagenet1k-256").to( + torch_device + ) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-1.6336e00, -7.3204e-02, -5.1883e-01]).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_semantic_segmentation(self): + model = MobileViTV2ForSemanticSegmentation.from_pretrained("shehan97/mobilevitv2-1.0-voc-deeplabv3") + model = model.to(torch_device) + + image_processor = MobileViTImageProcessor.from_pretrained("shehan97/mobilevitv2-1.0-voc-deeplabv3") + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + logits = outputs.logits + + # verify the logits + expected_shape = torch.Size((1, 21, 32, 32)) + self.assertEqual(logits.shape, expected_shape) + + expected_slice = torch.tensor( + [ + [[7.0863, 7.1525, 6.8201], [6.6931, 6.8770, 6.8933], [6.2978, 7.0366, 6.9636]], + [[-3.7134, -3.6712, -3.6675], [-3.5825, -3.3549, -3.4777], [-3.3435, -3.3979, -3.2857]], + [[-2.9329, -2.8003, -2.7369], [-3.0564, -2.4780, -2.0207], [-2.6889, -1.9298, -1.7640]], + ], + device=torch_device, + ) + + self.assertTrue(torch.allclose(logits[0, :3, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_post_processing_semantic_segmentation(self): + model = MobileViTV2ForSemanticSegmentation.from_pretrained("shehan97/mobilevitv2-1.0-voc-deeplabv3") + model = model.to(torch_device) + + image_processor = MobileViTImageProcessor.from_pretrained("shehan97/mobilevitv2-1.0-voc-deeplabv3") + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + outputs.logits = outputs.logits.detach().cpu() + + segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(50, 60)]) + expected_shape = torch.Size((50, 60)) + self.assertEqual(segmentation[0].shape, expected_shape) + + segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs) + expected_shape = torch.Size((32, 32)) + self.assertEqual(segmentation[0].shape, expected_shape)