From 1b37fb5e17add5e65f7076f4e88225e11156c107 Mon Sep 17 00:00:00 2001 From: Bartosz Szmelczynski <43574448+Bearnardd@users.noreply.github.com> Date: Fri, 20 Jan 2023 09:35:42 +0100 Subject: [PATCH] Efficientformer (#20459) - Adds EfficientFormer V1 to transformers - PR co-authored by @novice03 and @Bearnardd Co-authored-by: novice Co-authored-by: novice <44259234+novice03@users.noreply.github.com> --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/efficientformer.mdx | 65 ++ src/transformers/__init__.py | 20 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 8 + .../models/efficientformer/__init__.py | 80 ++ .../configuration_efficientformer.py | 167 ++++ ..._original_pytorch_checkpoint_to_pytorch.py | 252 ++++++ .../image_processing_efficientformer.py | 339 ++++++++ .../modeling_efficientformer.py | 795 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 31 + .../utils/dummy_vision_objects.py | 7 + tests/models/efficientformer/__init__.py | 0 .../test_image_processing_efficientformer.py | 192 +++++ .../test_modeling_efficientformer.py | 466 ++++++++++ 25 files changed, 2438 insertions(+) create mode 100644 docs/source/en/model_doc/efficientformer.mdx create mode 100644 src/transformers/models/efficientformer/__init__.py create mode 100644 src/transformers/models/efficientformer/configuration_efficientformer.py create mode 100644 src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py create mode 100644 src/transformers/models/efficientformer/image_processing_efficientformer.py create mode 100644 src/transformers/models/efficientformer/modeling_efficientformer.py create mode 100644 tests/models/efficientformer/__init__.py create mode 100644 tests/models/efficientformer/test_image_processing_efficientformer.py create mode 100644 tests/models/efficientformer/test_modeling_efficientformer.py diff --git a/README.md b/README.md index 0a6fd088a8..b26d470737 100644 --- a/README.md +++ b/README.md @@ -316,6 +316,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. diff --git a/README_es.md b/README_es.md index 4a2e0180bf..e702a3f2ec 100644 --- a/README_es.md +++ b/README_es.md @@ -309,6 +309,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. diff --git a/README_hd.md b/README_hd.md index 93320d3fae..8ae6aba204 100644 --- a/README_hd.md +++ b/README_hd.md @@ -281,6 +281,7 @@ conda install -c huggingface transformers 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER से) साथ में कागज [OCR-मुक्त डॉक्यूमेंट अंडरस्टैंडिंग ट्रांसफॉर्मर](https://arxiv.org/abs /2111.15664) गीवूक किम, टीकग्यू होंग, मूनबिन यिम, जियोंग्योन नाम, जिनयॉन्ग पार्क, जिनयॉन्ग यिम, वोनसेओक ह्वांग, सांगडू यूं, डोंगयून हान, सेउंग्युन पार्क द्वारा। 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (फेसबुक से) साथ में पेपर [ओपन-डोमेन क्वेश्चन आंसरिंग के लिए डेंस पैसेज रिट्रीवल](https://arxiv. org/abs/2004.04906) व्लादिमीर करपुखिन, बरलास ओज़ुज़, सेवन मिन, पैट्रिक लुईस, लेडेल वू, सर्गेई एडुनोव, डैनकी चेन, और वेन-ताऊ यिह द्वारा। 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (इंटेल लैब्स से) साथ में कागज [विज़न ट्रांसफॉर्मर्स फॉर डेंस प्रेडिक्शन](https://arxiv.org /abs/2103.13413) रेने रैनफ्टल, एलेक्सी बोचकोवस्की, व्लादलेन कोल्टन द्वारा। +1. **[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google रिसर्च/स्टैनफोर्ड यूनिवर्सिटी से) साथ में दिया गया पेपर [इलेक्ट्रा: जेनरेटर के बजाय भेदभाव करने वाले के रूप में टेक्स्ट एन्कोडर्स का पूर्व-प्रशिक्षण] (https://arxiv.org/abs/2003.10555) केविन क्लार्क, मिन्ह-थांग लुओंग, क्वोक वी. ले, क्रिस्टोफर डी. मैनिंग द्वारा पोस्ट किया गया। 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google रिसर्च से) साथ में दिया गया पेपर [सीक्वेंस जेनरेशन टास्क के लिए प्री-ट्रेंड चेकपॉइंट का इस्तेमाल करना](https:/ /arxiv.org/abs/1907.12461) साशा रोठे, शशि नारायण, अलियाक्सि सेवेरिन द्वारा। 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)**(Baidu से) साथ देने वाला पेपर [ERNIE: एन्हांस्ड रिप्रेजेंटेशन थ्रू नॉलेज इंटीग्रेशन](https://arxiv.org/abs/1904.09223) यू सन, शुओहुआन वांग, युकुन ली, शिकुन फेंग, ज़ुई चेन, हान झांग, शिन तियान, डैनक्सियांग झू, हाओ तियान, हुआ वू द्वारा पोस्ट किया गया। diff --git a/README_ja.md b/README_ja.md index f70ae637b3..c1fe5813e8 100644 --- a/README_ja.md +++ b/README_ja.md @@ -343,6 +343,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER から), Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park から公開された研究論文: [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (Facebook から) Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih から公開された研究論文: [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (Intel Labs から) René Ranftl, Alexey Bochkovskiy, Vladlen Koltun から公開された研究論文: [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) +1. **[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (Snap Research から) Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. から公開された研究論文 [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google Research/Stanford University から) Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning から公開された研究論文: [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google Research から) Sascha Rothe, Shashi Narayan, Aliaksei Severyn から公開された研究論文: [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (Baidu から) Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu から公開された研究論文: [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) diff --git a/README_ko.md b/README_ko.md index c5c3354a2c..f2b928f806 100644 --- a/README_ko.md +++ b/README_ko.md @@ -258,6 +258,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER 에서) Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 의 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 논문과 함께 발표했습니다. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (Facebook 에서) Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 의 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 논문과 함께 발표했습니다. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (Intel Labs 에서) René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 의 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 논문과 함께 발표했습니다. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google Research/Stanford University 에서) Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 의 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 논문과 함께 발표했습니다. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google Research 에서) Sascha Rothe, Shashi Narayan, Aliaksei Severyn 의 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 논문과 함께 발표했습니다. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (Baidu 에서) Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu 의 [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 3d61804395..7a111fd12d 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -282,6 +282,7 @@ conda install -c huggingface transformers 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (来自 NAVER) 伴随论文 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 由 Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 发布。 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (来自 Intel Labs) 伴随论文 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 由 René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 发布。 +1. **[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (来自 Snap Research) 伴随论文 [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) 由 Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren 发布。 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (来自 Baidu) 伴随论文 [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 57c13b88cb..123a076ed4 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -294,6 +294,7 @@ conda install -c huggingface transformers 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/main/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 15aedb0cb7..31a34b9e88 100755 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -420,6 +420,8 @@ title: DiT - local: model_doc/dpt title: DPT + - local: model_doc/efficientformer + title: EfficientFormer - local: model_doc/glpn title: GLPN - local: model_doc/imagegpt diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index bdc4685346..321bbc29b6 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -95,6 +95,7 @@ The documentation is organized into five sections: 1. **[Donut](model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientFormer](model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. 1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. @@ -274,6 +275,7 @@ Flax), PyTorch, and/or TensorFlow. | DonutSwin | ❌ | ❌ | ✅ | ❌ | ❌ | | DPR | ✅ | ✅ | ✅ | ✅ | ❌ | | DPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| EfficientFormer | ❌ | ❌ | ✅ | ❌ | ❌ | | ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | | Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | | ERNIE | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/efficientformer.mdx b/docs/source/en/model_doc/efficientformer.mdx new file mode 100644 index 0000000000..aba86f1317 --- /dev/null +++ b/docs/source/en/model_doc/efficientformer.mdx @@ -0,0 +1,65 @@ + + +# EfficientFormer + +## Overview + +The EfficientFormer model was proposed in [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) +by Yanyu Li, Geng Yuan, Yang Wen, Eric Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. EfficientFormer proposes a +dimension-consistent pure transformer that can be run on mobile devices for dense prediction tasks like image classification, object +detection and semantic segmentation. + +The abstract from the paper is the following: + +*Vision Transformers (ViT) have shown rapid progress in computer vision tasks, achieving promising results on various benchmarks. +However, due to the massive number of parameters and model design, e.g., attention mechanism, ViT-based models are generally +times slower than lightweight convolutional networks. Therefore, the deployment of ViT for real-time applications is particularly +challenging, especially on resource-constrained hardware such as mobile devices. Recent efforts try to reduce the computation +complexity of ViT through network architecture search or hybrid design with MobileNet block, yet the inference speed is still +unsatisfactory. This leads to an important question: can transformers run as fast as MobileNet while obtaining high performance? +To answer this, we first revisit the network architecture and operators used in ViT-based models and identify inefficient designs. +Then we introduce a dimension-consistent pure transformer (without MobileNet blocks) as a design paradigm. +Finally, we perform latency-driven slimming to get a series of final models dubbed EfficientFormer. +Extensive experiments show the superiority of EfficientFormer in performance and speed on mobile devices. +Our fastest model, EfficientFormer-L1, achieves 79.2% top-1 accuracy on ImageNet-1K with only 1.6 ms inference latency on +iPhone 12 (compiled with CoreML), which { runs as fast as MobileNetV2×1.4 (1.6 ms, 74.7% top-1),} and our largest model, +EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work proves that properly designed transformers can +reach extremely low latency on mobile devices while maintaining high performance.* + +This model was contributed by [novice03](https://huggingface.co/novice03) and [Bearnardd](https://huggingface.co/Bearnardd). +The original code can be found [here](https://github.com/snap-research/EfficientFormer). + + +## EfficientFormerConfig + +[[autodoc]] EfficientFormerConfig + +## EfficientFormerImageProcessor + +[[autodoc]] EfficientFormerImageProcessor + - preprocess + +## EfficientFormerModel + +[[autodoc]] EfficientFormerModel + - forward + +## EfficientFormerForImageClassification + +[[autodoc]] EfficientFormerForImageClassification + - forward + +## EfficientFormerForImageClassificationWithTeacher + +[[autodoc]] EfficientFormerForImageClassificationWithTeacher + - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e01e502587..ad99e29745 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -244,6 +244,7 @@ _import_structure = { "DPRReaderTokenizer", ], "models.dpt": ["DPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DPTConfig"], + "models.efficientformer": ["EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "EfficientFormerConfig"], "models.electra": ["ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "ElectraConfig", "ElectraTokenizer"], "models.encoder_decoder": ["EncoderDecoderConfig"], "models.ernie": [ @@ -791,6 +792,7 @@ else: _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) + _import_structure["models.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"]) _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"]) @@ -1374,6 +1376,15 @@ else: "DPTPreTrainedModel", ] ) + _import_structure["models.efficientformer"].extend( + [ + "EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "EfficientFormerForImageClassification", + "EfficientFormerForImageClassificationWithTeacher", + "EfficientFormerModel", + "EfficientFormerPreTrainedModel", + ] + ) _import_structure["models.electra"].extend( [ "ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3658,6 +3669,7 @@ if TYPE_CHECKING: DPRReaderTokenizer, ) from .models.dpt import DPT_PRETRAINED_CONFIG_ARCHIVE_MAP, DPTConfig + from .models.efficientformer import EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, EfficientFormerConfig from .models.electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig, ElectraTokenizer from .models.encoder_decoder import EncoderDecoderConfig from .models.ernie import ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP, ErnieConfig @@ -4133,6 +4145,7 @@ if TYPE_CHECKING: from .models.detr import DetrFeatureExtractor, DetrImageProcessor from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor + from .models.efficientformer import EfficientFormerImageProcessor from .models.flava import FlavaFeatureExtractor, FlavaImageProcessor, FlavaProcessor from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor @@ -4619,6 +4632,13 @@ if TYPE_CHECKING: DPTModel, DPTPreTrainedModel, ) + from .models.efficientformer import ( + EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + EfficientFormerForImageClassification, + EfficientFormerForImageClassificationWithTeacher, + EfficientFormerModel, + EfficientFormerPreTrainedModel, + ) from .models.electra import ( ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST, ElectraForCausalLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 5e3d849d7b..8ebc8e9f47 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -65,6 +65,7 @@ from . import ( donut, dpr, dpt, + efficientformer, electra, encoder_decoder, ernie, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 2ab1bbfcc5..c80259a134 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -69,6 +69,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("donut-swin", "DonutSwinConfig"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), + ("efficientformer", "EfficientFormerConfig"), ("electra", "ElectraConfig"), ("encoder-decoder", "EncoderDecoderConfig"), ("ernie", "ErnieConfig"), @@ -233,6 +234,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( ("donut-swin", "DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("dpr", "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("dpt", "DPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("efficientformer", "EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("electra", "ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("ernie", "ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("esm", "ESM_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -393,6 +395,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("donut-swin", "DonutSwin"), ("dpr", "DPR"), ("dpt", "DPT"), + ("efficientformer", "EfficientFormer"), ("electra", "ELECTRA"), ("encoder-decoder", "Encoder decoder"), ("ernie", "ERNIE"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index a957abd0c0..ca0621a784 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -53,6 +53,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( ("dinat", "ViTImageProcessor"), ("donut-swin", "DonutImageProcessor"), ("dpt", "DPTImageProcessor"), + ("efficientformer", "EfficientFormerImageProcessor"), ("flava", "FlavaImageProcessor"), ("git", ("CLIPImageProcessor", "VideoMAEImageProcessor")), ("glpn", "GLPNImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index b5eaf1e0e4..ce3de79bd4 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -69,6 +69,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("donut-swin", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), + ("efficientformer", "EfficientFormerModel"), ("electra", "ElectraModel"), ("ernie", "ErnieModel"), ("esm", "EsmModel"), @@ -397,6 +398,13 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( ("data2vec-vision", "Data2VecVisionForImageClassification"), ("deit", ("DeiTForImageClassification", "DeiTForImageClassificationWithTeacher")), ("dinat", "DinatForImageClassification"), + ( + "efficientformer", + ( + "EfficientFormerForImageClassification", + "EfficientFormerForImageClassificationWithTeacher", + ), + ), ("imagegpt", "ImageGPTForImageClassification"), ("levit", ("LevitForImageClassification", "LevitForImageClassificationWithTeacher")), ("mobilenet_v1", "MobileNetV1ForImageClassification"), diff --git a/src/transformers/models/efficientformer/__init__.py b/src/transformers/models/efficientformer/__init__.py new file mode 100644 index 0000000000..8731556974 --- /dev/null +++ b/src/transformers/models/efficientformer/__init__.py @@ -0,0 +1,80 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available + + +_import_structure = { + "configuration_efficientformer": [ + "EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "EfficientFormerConfig", + ] +} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_efficientformer"] = ["EfficientFormerImageProcessor"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_efficientformer"] = [ + "EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "EfficientFormerForImageClassification", + "EfficientFormerForImageClassificationWithTeacher", + "EfficientFormerModel", + "EfficientFormerPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_efficientformer import EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, EfficientFormerConfig + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_efficientformer import EfficientFormerImageProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_efficientformer import ( + EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + EfficientFormerForImageClassification, + EfficientFormerForImageClassificationWithTeacher, + EfficientFormerModel, + EfficientFormerPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/efficientformer/configuration_efficientformer.py b/src/transformers/models/efficientformer/configuration_efficientformer.py new file mode 100644 index 0000000000..6a2bc31eed --- /dev/null +++ b/src/transformers/models/efficientformer/configuration_efficientformer.py @@ -0,0 +1,167 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" EfficientFormer model configuration""" + +from typing import List + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "snap-research/efficientformer-l1-300": ( + "https://huggingface.co/snap-research/efficientformer-l1-300/resolve/main/config.json" + ), +} + + +class EfficientFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`EfficientFormerModel`]. It is used to + instantiate an EfficientFormer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the EfficientFormer + [snap-research/efficientformer-l1](https://huggingface.co/snap-research/efficientformer-l1) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + depths (`List(int)`, *optional*, defaults to `[3, 2, 6, 4]`) + Depth of each stage. + hidden_sizes (`List(int)`, *optional*, defaults to `[48, 96, 224, 448]`) + Dimensionality of each stage. + downsamples (`List(bool)`, *optional*, defaults to `[True, True, True, True]`) + Whether or not to downsample inputs between two stages. + dim (`int`, *optional*, defaults to 448): + Number of channels in Meta3D layers + key_dim (`int`, *optional*, defaults to 32): + The size of the key in meta3D block. + attention_ratio (`int`, *optional*, defaults to 4): + Ratio of the dimension of the query and value to the dimension of the key in MSHA block + resolution (`int`, *optional*, defaults to 5) + Size of each patch + num_hidden_layers (`int`, *optional*, defaults to 5): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the 3D MetaBlock. + mlp_expansion_ratio (`int`, *optional*, defaults to 4): + Ratio of size of the hidden dimensionality of an MLP to the dimensionality of its input. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings and encoder. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + pool_size (`int`, *optional*, defaults to 3): + Kernel size of pooling layers. + downsample_patch_size (`int`, *optional*, defaults to 3): + The size of patches in downsampling layers. + downsample_stride (`int`, *optional*, defaults to 2): + The stride of convolution kernels in downsampling layers. + downsample_pad (`int`, *optional*, defaults to 1): + Padding in downsampling layers. + drop_path_rate (`int`, *optional*, defaults to 0): + Rate at which to increase dropout probability in DropPath. + num_meta3d_blocks (`int`, *optional*, defaults to 1): + The number of 3D MetaBlocks in the last stage. + distillation (`bool`, *optional*, defaults to `True`): + Whether to add a distillation head. + use_layer_scale (`bool`, *optional*, defaults to `True`): + Whether to scale outputs from token mixers. + layer_scale_init_value (`float`, *optional*, defaults to 1e-5): + Factor by which outputs from token mixers are scaled. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + + Example: + + ```python + >>> from transformers import EfficientFormerConfig, EfficientFormerModel + + >>> # Initializing a EfficientFormer efficientformer-l1 style configuration + >>> configuration = EfficientFormerConfig() + + >>> # Initializing a EfficientFormerModel (with random weights) from the efficientformer-l3 style configuration + >>> model = EfficientFormerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "efficientformer" + + def __init__( + self, + depths: List[int] = [3, 2, 6, 4], + hidden_sizes: List[int] = [48, 96, 224, 448], + downsamples: List[bool] = [True, True, True, True], + dim: int = 448, + key_dim: int = 32, + attention_ratio: int = 4, + resolution: int = 7, + num_hidden_layers: int = 5, + num_attention_heads: int = 8, + mlp_expansion_ratio: int = 4, + hidden_dropout_prob: float = 0.0, + patch_size: int = 16, + num_channels: int = 3, + pool_size: int = 3, + downsample_patch_size: int = 3, + downsample_stride: int = 2, + downsample_pad: int = 1, + drop_path_rate: float = 0.0, + num_meta3d_blocks: int = 1, + distillation: bool = True, + use_layer_scale: bool = True, + layer_scale_init_value: float = 1e-5, + hidden_act: str = "gelu", + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-12, + **kwargs + ) -> None: + super().__init__(**kwargs) + + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.hidden_sizes = hidden_sizes + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.patch_size = patch_size + self.num_channels = num_channels + self.depths = depths + self.mlp_expansion_ratio = mlp_expansion_ratio + self.downsamples = downsamples + self.dim = dim + self.key_dim = key_dim + self.attention_ratio = attention_ratio + self.resolution = resolution + self.pool_size = pool_size + self.downsample_patch_size = downsample_patch_size + self.downsample_stride = downsample_stride + self.downsample_pad = downsample_pad + self.drop_path_rate = drop_path_rate + self.num_meta3d_blocks = num_meta3d_blocks + self.distillation = distillation + self.use_layer_scale = use_layer_scale + self.layer_scale_init_value = layer_scale_init_value diff --git a/src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000..342f1263d3 --- /dev/null +++ b/src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,252 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Convert EfficientFormer checkpoints from the original repository. + +URL: https://github.com/snap-research/EfficientFormer +""" + +import argparse +import re +from pathlib import Path + +import torch +from PIL import Image +from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor + +import requests +from transformers import ( + EfficientFormerConfig, + EfficientFormerForImageClassificationWithTeacher, + EfficientFormerImageProcessor, +) +from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling + + +def rename_key(old_name, num_meta4D_last_stage): + new_name = old_name + + if "patch_embed" in old_name: + _, layer, param = old_name.split(".") + + if layer == "0": + new_name = old_name.replace("0", "convolution1") + elif layer == "1": + new_name = old_name.replace("1", "batchnorm_before") + elif layer == "3": + new_name = old_name.replace("3", "convolution2") + else: + new_name = old_name.replace("4", "batchnorm_after") + + if "network" in old_name and re.search("\d\.\d", old_name): + two_digit_num = r"\b\d{2}\b" + if bool(re.search(two_digit_num, old_name)): + match = re.search("\d\.\d\d.", old_name).group() + else: + match = re.search("\d\.\d.", old_name).group() + if int(match[0]) < 6: + trimmed_name = old_name.replace(match, "") + trimmed_name = trimmed_name.replace("network", match[0] + ".meta4D_layers.blocks." + match[2:-1]) + new_name = "intermediate_stages." + trimmed_name + else: + trimmed_name = old_name.replace(match, "") + if int(match[2]) < num_meta4D_last_stage: + trimmed_name = trimmed_name.replace("network", "meta4D_layers.blocks." + match[2]) + else: + layer_index = str(int(match[2]) - num_meta4D_last_stage) + trimmed_name = trimmed_name.replace("network", "meta3D_layers.blocks." + layer_index) + if "norm1" in old_name: + trimmed_name = trimmed_name.replace("norm1", "layernorm1") + elif "norm2" in old_name: + trimmed_name = trimmed_name.replace("norm2", "layernorm2") + elif "fc1" in old_name: + trimmed_name = trimmed_name.replace("fc1", "linear_in") + elif "fc2" in old_name: + trimmed_name = trimmed_name.replace("fc2", "linear_out") + + new_name = "last_stage." + trimmed_name + + elif "network" in old_name and re.search(".\d.", old_name): + new_name = old_name.replace("network", "intermediate_stages") + + if "fc" in new_name: + new_name = new_name.replace("fc", "convolution") + elif ("norm1" in new_name) and ("layernorm1" not in new_name): + new_name = new_name.replace("norm1", "batchnorm_before") + elif ("norm2" in new_name) and ("layernorm2" not in new_name): + new_name = new_name.replace("norm2", "batchnorm_after") + if "proj" in new_name: + new_name = new_name.replace("proj", "projection") + if "dist_head" in new_name: + new_name = new_name.replace("dist_head", "distillation_classifier") + elif "head" in new_name: + new_name = new_name.replace("head", "classifier") + elif "patch_embed" in new_name: + new_name = "efficientformer." + new_name + elif new_name == "norm.weight" or new_name == "norm.bias": + new_name = new_name.replace("norm", "layernorm") + new_name = "efficientformer." + new_name + else: + new_name = "efficientformer.encoder." + new_name + + return new_name + + +def convert_torch_checkpoint(checkpoint, num_meta4D_last_stage): + for key in checkpoint.copy().keys(): + val = checkpoint.pop(key) + checkpoint[rename_key(key, num_meta4D_last_stage)] = val + + return checkpoint + + +# We will verify our results on a COCO image +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + return image + + +def convert_efficientformer_checkpoint( + checkpoint_path: Path, efficientformer_config_file: Path, pytorch_dump_path: Path, push_to_hub: bool +): + orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + config = EfficientFormerConfig.from_json_file(efficientformer_config_file) + model = EfficientFormerForImageClassificationWithTeacher(config) + model_name = "_".join(checkpoint_path.split("/")[-1].split(".")[0].split("_")[:-1]) + + num_meta4D_last_stage = config.depths[-1] - config.num_meta3d_blocks + 1 + new_state_dict = convert_torch_checkpoint(orig_state_dict, num_meta4D_last_stage) + + model.load_state_dict(new_state_dict) + model.eval() + + pillow_resamplings = { + "bilinear": PILImageResampling.BILINEAR, + "bicubic": PILImageResampling.BICUBIC, + "nearest": PILImageResampling.NEAREST, + } + + # prepare image + image = prepare_img() + image_size = 256 + crop_size = 224 + processor = EfficientFormerImageProcessor( + size={"shortest_edge": image_size}, + crop_size={"height": crop_size, "width": crop_size}, + resample=pillow_resamplings["bicubic"], + ) + pixel_values = processor(images=image, return_tensors="pt").pixel_values + + # original processing pipeline + image_transforms = Compose( + [ + Resize(image_size, interpolation=pillow_resamplings["bicubic"]), + CenterCrop(crop_size), + ToTensor(), + Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ] + ) + original_pixel_values = image_transforms(image).unsqueeze(0) + + assert torch.allclose(original_pixel_values, pixel_values) + + outputs = model(pixel_values) + logits = outputs.logits + + expected_shape = (1, 1000) + + if "l1" in model_name: + expected_logits = torch.Tensor( + [-0.1312, 0.4353, -1.0499, -0.5124, 0.4183, -0.6793, -1.3777, -0.0893, -0.7358, -2.4328] + ) + assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3) + assert logits.shape == expected_shape + elif "l3" in model_name: + expected_logits = torch.Tensor( + [-1.3150, -1.5456, -1.2556, -0.8496, -0.7127, -0.7897, -0.9728, -0.3052, 0.3751, -0.3127] + ) + assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3) + assert logits.shape == expected_shape + elif "l7" in model_name: + expected_logits = torch.Tensor( + [-1.0283, -1.4131, -0.5644, -1.3115, -0.5785, -1.2049, -0.7528, 0.1992, -0.3822, -0.0878] + ) + assert logits.shape == expected_shape + else: + raise ValueError( + f"Unknown model checkpoint: {checkpoint_path}. Supported version of efficientformer are l1, l3 and l7" + ) + + # Save Checkpoints + Path(pytorch_dump_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_path) + print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}") + processor.save_pretrained(pytorch_dump_path) + print(f"Processor successfuly saved at {pytorch_dump_path}") + + if push_to_hub: + print("Pushing model to the hub...") + + model.push_to_hub( + repo_id=f"Bearnardd/{pytorch_dump_path}", + commit_message="Add model", + use_temp_dir=True, + ) + processor.push_to_hub( + repo_id=f"Bearnardd/{pytorch_dump_path}", + commit_message="Add feature extractor", + use_temp_dir=True, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--pytorch_model_path", + default=None, + type=str, + required=True, + help="Path to EfficientFormer pytorch checkpoint.", + ) + parser.add_argument( + "--config_file", + default=None, + type=str, + required=True, + help="The json file for EfficientFormer model config.", + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + + parser.add_argument("--push_to_hub", action="store_true", help="Push model and feature extractor to the hub") + parser.add_argument( + "--no-push_to_hub", + dest="push_to_hub", + action="store_false", + help="Do not push model and feature extractor to the hub", + ) + parser.set_defaults(push_to_hub=True) + + args = parser.parse_args() + convert_efficientformer_checkpoint( + checkpoint_path=args.pytorch_model_path, + efficientformer_config_file=args.config_file, + pytorch_dump_path=args.pytorch_dump_path, + push_to_hub=args.push_to_hub, + ) diff --git a/src/transformers/models/efficientformer/image_processing_efficientformer.py b/src/transformers/models/efficientformer/image_processing_efficientformer.py new file mode 100644 index 0000000000..963946f266 --- /dev/null +++ b/src/transformers/models/efficientformer/image_processing_efficientformer.py @@ -0,0 +1,339 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for EfficientFormer.""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from transformers.utils.generic import TensorType + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + center_crop, + get_resize_output_image_size, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class EfficientFormerImageProcessor(BaseImageProcessor): + r""" + Constructs a EfficientFormer image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`Dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize: + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + crop_size: Dict[str, int] = None, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample: + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + + if "shortest_edge" in size: + size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False) + # size = get_resize_output_image_size(image, size["shortest_edge"], size["longest_edge"]) + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError(f"Size must contain 'height' and 'width' keys or 'shortest_edge' key. Got {size.keys()}") + return resize(image, size=size, resample=resample, data_format=data_format, **kwargs) + + def center_crop( + self, + image: np.ndarray, + size: Dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the + returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + Image to center crop. + size (`Dict[str, int]`): + Size of the output image in the form of a dictionary with keys `height` and `width`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` parameter must contain the keys (height, width). Got {size.keys()}") + return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs) + + def rescale( + self, image: np.ndarray, scale: float, data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs + ) -> np.ndarray: + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`float`): + The scaling factor to rescale pixel values by. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + + Returns: + `np.ndarray`: The rescaled image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `List[float]`): + Image mean to use for normalization. + std (`float` or `List[float]`): + Image standard deviation to use for normalization. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + + Returns: + `np.ndarray`: The normalized image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: int = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + resample = resample if resample is not None else self.resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + size_dict = get_size_dict(size) + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if do_resize and size is None: + raise ValueError("Size must be specified if do_resize is True.") + + if do_center_crop and crop_size is None: + raise ValueError("Crop size must be specified if do_center_crop is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size_dict, resample=resample) for image in images] + + if do_center_crop: + images = [self.center_crop(image=image, size=crop_size) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/efficientformer/modeling_efficientformer.py b/src/transformers/models/efficientformer/modeling_efficientformer.py new file mode 100644 index 0000000000..376f429c3f --- /dev/null +++ b/src/transformers/models/efficientformer/modeling_efficientformer.py @@ -0,0 +1,795 @@ +# coding=utf-8 +# Copyright 2022 Snapchat Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch EfficientFormer model.""" + +import itertools +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_efficientformer import EfficientFormerConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "EfficientFormerConfig" +_FEAT_EXTRACTOR_FOR_DOC = "EfficientFormerImageProcessor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "efficientformer-l1-300" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" + + +EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "huggingface/efficientformer-l1-300", + # See all EfficientFormer models at https://huggingface.co/models?filter=efficientformer +] + + +class EfficientFormerPatchEmbeddings(nn.Module): + """ + This class performs downsampling between two stages. For the input tensor with the shape [batch_size, num_channels, + height, width] it produces output tensor with the shape [batch_size, num_channels, height/stride, width/stride] + """ + + def __init__(self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True): + super().__init__() + self.num_channels = num_channels + + self.projection = nn.Conv2d( + num_channels, + embed_dim, + kernel_size=config.downsample_patch_size, + stride=config.downsample_stride, + padding=config.downsample_pad, + ) + self.norm = nn.BatchNorm2d(embed_dim) if apply_norm else nn.Identity() + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + + embeddings = self.projection(pixel_values) + embeddings = self.norm(embeddings) + + return embeddings + + +class EfficientFormerSelfAttention(nn.Module): + def __init__(self, dim: int, key_dim: int, num_heads: int, attention_ratio: int, resolution: int): + super().__init__() + + self.num_heads = num_heads + self.key_dim = key_dim + self.attention_ratio = attention_ratio + self.scale = key_dim**-0.5 + self.total_key_dim = key_dim * num_heads + self.expanded_key_dim = int(attention_ratio * key_dim) + self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads) + hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2 + self.qkv = nn.Linear(dim, hidden_size) + self.projection = nn.Linear(self.total_expanded_key_dim, dim) + points = list(itertools.product(range(resolution), range(resolution))) + num_points = len(points) + attention_offsets = {} + idxs = [] + for point_1 in points: + for point_2 in points: + offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1])) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + idxs.append(attention_offsets[offset]) + self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(attention_offsets))) + self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(num_points, num_points)) + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and hasattr(self, "ab"): + del self.ab + else: + self.ab = self.attention_biases[:, self.attention_bias_idxs] + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]: + batch_size, sequence_length, num_channels = hidden_states.shape + qkv = self.qkv(hidden_states) + query_layer, key_layer, value_layer = qkv.reshape(batch_size, sequence_length, self.num_heads, -1).split( + [self.key_dim, self.key_dim, self.expanded_key_dim], dim=3 + ) + query_layer = query_layer.permute(0, 2, 1, 3) + key_layer = key_layer.permute(0, 2, 1, 3) + value_layer = value_layer.permute(0, 2, 1, 3) + + attention_probs = (torch.matmul(query_layer, key_layer.transpose(-2, -1))) * self.scale + ( + self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab + ) + + attention_probs = attention_probs.softmax(dim=-1) + + context_layer = torch.matmul(attention_probs, value_layer).transpose(1, 2) + context_layer = context_layer.reshape(batch_size, sequence_length, self.total_expanded_key_dim) + context_layer = self.projection(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class EfficientFormerConvStem(nn.Module): + def __init__(self, config: EfficientFormerConfig, out_channels: int): + super().__init__() + + self.convolution1 = nn.Conv2d(config.num_channels, out_channels // 2, kernel_size=3, stride=2, padding=1) + self.batchnorm_before = nn.BatchNorm2d(out_channels // 2) + + self.convolution2 = nn.Conv2d(out_channels // 2, out_channels, kernel_size=3, stride=2, padding=1) + self.batchnorm_after = nn.BatchNorm2d(out_channels) + + self.activation = nn.ReLU() + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + features = self.batchnorm_before(self.convolution1(pixel_values)) + features = self.activation(features) + features = self.batchnorm_after(self.convolution2(features)) + features = self.activation(features) + + return features + + +class EfficientFormerPooling(nn.Module): + def __init__(self, pool_size: int): + super().__init__() + self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = self.pool(hidden_states) - hidden_states + return output + + +class EfficientFormerDenseMlp(nn.Module): + def __init__( + self, + config: EfficientFormerConfig, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.linear_in = nn.Linear(in_features, hidden_features) + self.activation = ACT2FN[config.hidden_act] + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.linear_out = nn.Linear(hidden_features, out_features) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.linear_in(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.linear_out(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class EfficientFormerConvMlp(nn.Module): + def __init__( + self, + config: EfficientFormerConfig, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + drop: float = 0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.convolution1 = nn.Conv2d(in_features, hidden_features, 1) + self.actvation = ACT2FN[config.hidden_act] + self.convolution2 = nn.Conv2d(hidden_features, out_features, 1) + self.dropout = nn.Dropout(drop) + + self.batchnorm_before = nn.BatchNorm2d(hidden_features) + self.batchnorm_after = nn.BatchNorm2d(out_features) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.convolution1(hidden_state) + hidden_state = self.batchnorm_before(hidden_state) + + hidden_state = self.actvation(hidden_state) + hidden_state = self.dropout(hidden_state) + hidden_state = self.convolution2(hidden_state) + + hidden_state = self.batchnorm_after(hidden_state) + + hidden_state = self.dropout(hidden_state) + return hidden_state + + +# Copied from transformers.models.convnext.modeling_convnext.drop_path +def drop_path(input, drop_prob: float = 0.0, training: bool = False): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Bit +class EfficientFormerDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class EfficientFormerFlat(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]: + hidden_states = hidden_states.flatten(2).transpose(1, 2) + return hidden_states + + +class EfficientFormerMeta3D(nn.Module): + def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0): + super().__init__() + + self.token_mixer = EfficientFormerSelfAttention( + dim=config.dim, + key_dim=config.key_dim, + num_heads=config.num_attention_heads, + attention_ratio=config.attention_ratio, + resolution=config.resolution, + ) + self.layernorm1 = nn.LayerNorm(dim) + self.layernorm2 = nn.LayerNorm(dim) + mlp_hidden_dim = int(dim * config.mlp_expansion_ratio) + self.mlp = EfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim) + + self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.use_layer_scale = config.use_layer_scale + if config.use_layer_scale: + self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True) + self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]: + self_attention_outputs = self.token_mixer(self.layernorm1(hidden_states), output_attentions) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.use_layer_scale: + layer_output = hidden_states + self.drop_path( + self.layer_scale_1.unsqueeze(0).unsqueeze(0) * attention_output + ) + layer_output = layer_output + self.drop_path( + self.layer_scale_2.unsqueeze(0).unsqueeze(0) * self.mlp(self.layernorm2(layer_output)) + ) + else: + layer_output = hidden_states + self.drop_path(attention_output) + layer_output = layer_output + self.drop_path(self.mlp(self.layernorm2(layer_output))) + + outputs = (layer_output,) + outputs + + return outputs + + +class EfficientFormerMeta3DLayers(nn.Module): + def __init__(self, config: EfficientFormerConfig): + super().__init__() + drop_paths = [ + config.drop_path_rate * (block_idx + sum(config.depths[:-1])) + for block_idx in range(config.num_meta3d_blocks) + ] + self.blocks = nn.ModuleList( + [EfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path) for drop_path in drop_paths] + ) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]: + all_attention_outputs = () if output_attentions else None + for layer_module in self.blocks: + if isinstance(hidden_states, tuple): + hidden_states = hidden_states[0] + hidden_states = layer_module(hidden_states, output_attentions) + if output_attentions: + all_attention_outputs = all_attention_outputs + (hidden_states[1],) + if output_attentions: + outputs = (hidden_states[0],) + all_attention_outputs + return outputs + return hidden_states + + +class EfficientFormerMeta4D(nn.Module): + def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0): + super().__init__() + pool_size = config.pool_size if config.pool_size is not None else 3 + self.token_mixer = EfficientFormerPooling(pool_size=pool_size) + mlp_hidden_dim = int(dim * config.mlp_expansion_ratio) + self.mlp = EfficientFormerConvMlp( + config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob + ) + + self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.use_layer_scale = config.use_layer_scale + if config.use_layer_scale: + self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True) + self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True) + + def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]: + outputs = self.token_mixer(hidden_states) + + if self.use_layer_scale: + layer_output = hidden_states + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * outputs) + layer_output = layer_output + self.drop_path( + self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(layer_output) + ) + else: + layer_output = hidden_states + self.drop_path(outputs) + layer_output = layer_output + self.drop_path(self.mlp(layer_output)) + + return layer_output + + +class EfficientFormerMeta4DLayers(nn.Module): + def __init__(self, config: EfficientFormerConfig, stage_idx: int): + super().__init__() + num_layers = ( + config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks + ) + drop_paths = [ + config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers) + ] + self.blocks = nn.ModuleList( + [ + EfficientFormerMeta4D(config, config.hidden_sizes[stage_idx], drop_path=drop_path) + for drop_path in drop_paths + ] + ) + + def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]: + for layer_module in self.blocks: + hidden_states = layer_module(hidden_states) + return hidden_states + + +class EfficientFormerIntermediateStage(nn.Module): + def __init__(self, config: EfficientFormerConfig, index: int): + super().__init__() + self.meta4D_layers = EfficientFormerMeta4DLayers(config, index) + + def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]: + hidden_states = self.meta4D_layers(hidden_states) + return hidden_states + + +class EfficientFormerLastStage(nn.Module): + def __init__(self, config: EfficientFormerConfig): + super().__init__() + self.meta4D_layers = EfficientFormerMeta4DLayers(config, -1) + self.flat = EfficientFormerFlat() + self.meta3D_layers = EfficientFormerMeta3DLayers(config) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]: + hidden_states = self.meta4D_layers(hidden_states) + hidden_states = self.flat(hidden_states) + hidden_states = self.meta3D_layers(hidden_states, output_attentions) + + return hidden_states + + +class EfficientFormerEncoder(nn.Module): + def __init__(self, config: EfficientFormerConfig): + super().__init__() + self.config = config + num_intermediate_stages = len(config.depths) - 1 + downsamples = [ + config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1] + for i in range(num_intermediate_stages) + ] + intermediate_stages = [] + for i in range(num_intermediate_stages): + intermediate_stages.append(EfficientFormerIntermediateStage(config, i)) + if downsamples[i]: + intermediate_stages.append( + EfficientFormerPatchEmbeddings(config, config.hidden_sizes[i], config.hidden_sizes[i + 1]) + ) + + self.intermediate_stages = nn.ModuleList(intermediate_stages) + self.last_stage = EfficientFormerLastStage(config) + + def forward( + self, + hidden_states: torch.Tensor, + output_hidden_states: bool = False, + output_attentions: bool = False, + return_dict: bool = True, + ) -> BaseModelOutput: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + for layer_module in self.intermediate_stages: + hidden_states = layer_module(hidden_states) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_output = self.last_stage(hidden_states, output_attentions=output_attentions) + if output_attentions: + all_self_attentions = all_self_attentions + layer_output[1:] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (layer_output[0],) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutput( + last_hidden_state=layer_output[0], + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class EfficientFormerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = EfficientFormerConfig + base_model_prefix = "efficientformer" + main_input_name = "pixel_values" + supports_gradient_checkpointing = False + + def _init_weights(self, module: nn.Module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +EFFICIENTFORMER_START_DOCSTRING = r""" + This model is a PyTorch [nn.Module](https://pytorch.org/docs/stable/nn.html#nn.Module) subclass. Use it as a + regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. + + Parameters: + config ([`EfficientFormerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +EFFICIENTFORMER_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See + [`ViTFeatureExtractor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.", + EFFICIENTFORMER_START_DOCSTRING, +) +class EfficientFormerModel(EfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig): + super().__init__(config) + self.config = config + + self.patch_embed = EfficientFormerConvStem(config, config.hidden_sizes[0]) + self.encoder = EfficientFormerEncoder(config) + self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.patch_embed(pixel_values) + encoder_outputs = self.encoder( + embedding_output, output_attentions=output_attentions, output_hidden_states=output_hidden_states + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + if not return_dict: + head_outputs = (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """ + EfficientFormer Model transformer with an image classification head on top (a linear layer on top of the final + hidden state of the [CLS] token) e.g. for ImageNet. + """, + EFFICIENTFORMER_START_DOCSTRING, +) +class EfficientFormerForImageClassification(EfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig): + super().__init__(config) + + self.num_labels = config.num_labels + self.efficientformer = EfficientFormerModel(config) + + # Classifier head + self.classifier = ( + nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.efficientformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output.mean(-2)) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@dataclass +class EfficientFormerForImageClassificationWithTeacherOutput(ModelOutput): + """ + Output type of [`EfficientFormerForImageClassificationWithTeacher`]. + + Args: + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Prediction scores as the average of the cls_logits and distillation logits. + cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the + class token). + distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the + distillation token). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + logits: torch.FloatTensor = None + cls_logits: torch.FloatTensor = None + distillation_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@add_start_docstrings( + """ + EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden + state of the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for + ImageNet. + + + + This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet + supported. + + + """, + EFFICIENTFORMER_START_DOCSTRING, +) +class EfficientFormerForImageClassificationWithTeacher(EfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig): + super().__init__(config) + + self.num_labels = config.num_labels + self.efficientformer = EfficientFormerModel(config) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + # Distillation head + self.distillation_classifier = ( + nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=EfficientFormerForImageClassificationWithTeacherOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, EfficientFormerForImageClassificationWithTeacherOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.efficientformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + cls_logits = self.classifier(sequence_output.mean(-2)) + distillation_logits = self.distillation_classifier(sequence_output.mean(-2)) + + # during inference, return the average of both classifier predictions + logits = (cls_logits + distillation_logits) / 2 + + if not return_dict: + output = (logits, cls_logits, distillation_logits) + outputs[1:] + return output + + return EfficientFormerForImageClassificationWithTeacherOutput( + logits=logits, + cls_logits=cls_logits, + distillation_logits=distillation_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index bc0cf04816..32737d93db 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2149,6 +2149,37 @@ class DPTPreTrainedModel(metaclass=DummyObject): requires_backends(self, ["torch"]) +EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class EfficientFormerForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class EfficientFormerForImageClassificationWithTeacher(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class EfficientFormerModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class EfficientFormerPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 18b1f07ef2..739ea3a618 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -171,6 +171,13 @@ class DPTImageProcessor(metaclass=DummyObject): requires_backends(self, ["vision"]) +class EfficientFormerImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class FlavaFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/efficientformer/__init__.py b/tests/models/efficientformer/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/efficientformer/test_image_processing_efficientformer.py b/tests/models/efficientformer/test_image_processing_efficientformer.py new file mode 100644 index 0000000000..c672b15be1 --- /dev/null +++ b/tests/models/efficientformer/test_image_processing_efficientformer.py @@ -0,0 +1,192 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import ViTFeatureExtractor + + +class EfficientFormerImageProcessorTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=13, + num_channels=3, + image_size=224, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + } + + +@require_torch +@require_vision +class EfficientFormerImageProcessorTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = ViTFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = EfficientFormerImageProcessorTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size["height"], + self.feature_extract_tester.size["width"], + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size["height"], + self.feature_extract_tester.size["width"], + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size["height"], + self.feature_extract_tester.size["width"], + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size["height"], + self.feature_extract_tester.size["width"], + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size["height"], + self.feature_extract_tester.size["width"], + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size["height"], + self.feature_extract_tester.size["width"], + ), + ) diff --git a/tests/models/efficientformer/test_modeling_efficientformer.py b/tests/models/efficientformer/test_modeling_efficientformer.py new file mode 100644 index 0000000000..4426b37e9b --- /dev/null +++ b/tests/models/efficientformer/test_modeling_efficientformer.py @@ -0,0 +1,466 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch EfficientFormer model. """ + + +import inspect +import unittest +import warnings + +from transformers import EfficientFormerConfig +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + MODEL_MAPPING, + EfficientFormerForImageClassification, + EfficientFormerForImageClassificationWithTeacher, + EfficientFormerModel, + ) + from transformers.models.efficientformer.modeling_efficientformer import ( + EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + ) + + +if is_vision_available(): + from PIL import Image + + from transformers import EfficientFormerImageProcessor + + +class EfficientFormerModelTester: + def __init__( + self, + parent, + batch_size: int = 13, + image_size: int = 224, + patch_size: int = 2, + embed_dim: int = 48, # last embed dim of stem + num_channels: int = 3, + is_training: bool = True, + use_labels: bool = True, + hidden_size: int = 448, + num_hidden_layers: int = 7, # For the l1 + num_attention_heads: int = 8, + intermediate_size: int = 37, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + type_sequence_label_size: int = 10, + initializer_range: float = 0.02, + encoder_stride: int = 2, + num_attention_outputs: int = 1, # For l1 + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.encoder_stride = encoder_stride + self.num_attention_outputs = num_attention_outputs + self.embed_dim = embed_dim + self.seq_length = embed_dim + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + return config, pixel_values, labels + + def get_config(self): + return EfficientFormerConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = EfficientFormerModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = EfficientFormerForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = EfficientFormerForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class EfficientFormerModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as EfficientFormer does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = ( + ( + EfficientFormerModel, + EfficientFormerForImageClassificationWithTeacher, + EfficientFormerForImageClassification, + ) + if is_torch_available() + else () + ) + fx_compatible = False + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = EfficientFormerModelTester(self) + self.config_tester = ConfigTester( + self, config_class=EfficientFormerConfig, has_text_modality=False, hidden_size=37 + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="EfficientFormer does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="EfficientFormer does not support input and output embeddings") + def test_model_common_attributes(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + if hasattr(self.model_tester, "encoder_seq_length"): + seq_length = self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: + seq_length = seq_length * self.model_tester.chunk_length + else: + seq_length = self.model_tester.seq_length + + self.assertListEqual( + list(hidden_states[-1].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + if config.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + + self.assertListEqual( + list(hidden_states[-1].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher": + del inputs_dict["labels"] + + return inputs_dict + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="EfficientFormer does not implement masked image modeling yet") + def test_for_masked_image_modeling(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + # special case for EfficientFormerForImageClassificationWithTeacher model + def test_training(self): + if not self.model_tester.is_training: + return + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + # EfficientFormerForImageClassificationWithTeacher supports inference-only + if ( + model_class in get_values(MODEL_MAPPING) + or model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher" + ): + continue + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_problem_types(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + problem_types = [ + {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float}, + {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long}, + {"title": "regression", "num_labels": 1, "dtype": torch.float}, + ] + + for model_class in self.all_model_classes: + if ( + model_class + not in [ + *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), + ] + or model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher" + ): + continue + + for problem_type in problem_types: + with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): + + config.problem_type = problem_type["title"] + config.num_labels = problem_type["num_labels"] + + model = model_class(config) + model.to(torch_device) + model.train() + + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + + if problem_type["num_labels"] > 1: + inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"]) + + inputs["labels"] = inputs["labels"].to(problem_type["dtype"]) + + # This tests that we do not trigger the warning form PyTorch "Using a target size that is different + # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure + # they have the same size." which is a symptom something in wrong for the regression problem. + # See https://github.com/huggingface/transformers/issues/11780 + with warnings.catch_warnings(record=True) as warning_list: + loss = model(**inputs).loss + for w in warning_list: + if "Using a target size that is different to the input size" in str(w.message): + raise ValueError( + f"Something is going wrong in the regression problem: intercepted {w.message}" + ) + + loss.backward() + + @slow + def test_model_from_pretrained(self): + for model_name in EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = EfficientFormerModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_attention_outputs) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_attention_outputs) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class EfficientFormerModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ( + EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300") + if is_vision_available() + else None + ) + + @slow + def test_inference_image_classification_head(self): + model = EfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300").to( + torch_device + ) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = (1, 1000) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-0.0555, 0.4825, -0.0852]).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0][:3], expected_slice, atol=1e-4)) + + @slow + def test_inference_image_classification_head_with_teacher(self): + model = EfficientFormerForImageClassificationWithTeacher.from_pretrained( + "snap-research/efficientformer-l1-300" + ).to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = (1, 1000) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-0.1312, 0.4353, -1.0499]).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0][:3], expected_slice, atol=1e-4))