From 3a6e4a221c8532a446ea551f0a907d22033aa15f Mon Sep 17 00:00:00 2001 From: Anahita Bhiwandiwalla Date: Wed, 25 Jan 2023 11:04:32 -0800 Subject: [PATCH] Add BridgeTower model (#20775) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Commit with BTModel and latest HF code * Placeholder classes for BTForMLM and BTForITR * Importing Bert classes from transformers * Removed objectives.py and dist_utils.py * Removed swin_transformer.py * Add image normalization, BridgeTowerForImageAndTextRetrieval * Add center_crop * Removing bert tokenizer and LCI references * Tested config loading from HF transformers hub * Removed state_dict updates and added path to hub * Enable center crop * Getting image_size from config, renaming num_heads and num_layers * Handling max_length in BridgeTowerProcessor * Add BridgeTowerForMaskedLM * Add doc string for BridgeTowerConfig * Add doc strings for BT config, processor, image processor * Adding docs, removed swin * Removed convert_bridgetower_original_to_pytorch.py * Added doc files for bridgetower, removed is_vision * Add support attention_mask=None and BridgeTowerModelOutput * Fix formatting * Fixes with 'make style', 'make quality', 'make fixup' * Remove downstream tasks from BridgeTowerModel * Formatting fixes, add return_dict to BT models * Clean up after doc_test * Update BTModelOutput return type, fix todo in doc * Remove loss_names from init * implement tests and update tuples returned by models * Add image reference to bridgetower.mdx * after make fix-copies, make fixup, make style, make quality, make repo-consistency * Rename class names with BridgeTower prefix * Fix for image_size in BTImageProcessor * implement feature extraction bridgetower tests * Update image_mean and image_std to be list * remove unused import * Removed old comments * Rework CLIP * update config in tests followed config update * Formatting fixes * Add copied from for BridgeTowerPredictionHeadTransform * Update bridgetower.mdx * Update test_feature_extraction_bridgetower.py * Update bridgetower.mdx * BridgeTowerForMaskedLM is conditioned on image too * Add BridgeTowerForMaskedLM * Fixes * Call post_init to init weights * Move freeze layers into method * Remove BTFeatureExtractor, add BT under multimodal models * Remove BTFeatureExtractor, add BT under multimodal models * Code review feedback - cleanup * Rename variables * Formatting and style to PR review feedback * Move center crop after resize * Use named parameters * Style fix for modeling_bridgetower.py * Update docs/source/en/model_doc/bridgetower.mdx Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update docs/source/en/model_doc/bridgetower.mdx Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update docs/source/en/model_doc/bridgetower.mdx Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/bridgetower/modeling_bridgetower.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/bridgetower/modeling_bridgetower.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update docs/source/en/model_doc/bridgetower.mdx Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> * Update src/transformers/models/bridgetower/modeling_bridgetower.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Rename config params, copy BERT classes, clean comments * Cleanup irtr * Replace Roberta imports, add BTTextConfig and Model * Update docs, add visionconfig, consistent arg names * make fixup * Comments for forward in BTModel and make fixup * correct tests * Remove inconsistent roberta copied from * Add BridgeTowerTextModel to dummy_pt_objects.py * Add BridgeTowerTextModel to IGNORE_NON_TESTED * Update docs for BT Text and Vision Configs * Treat BridgeTowerTextModel as a private model * BridgeTowerTextModel as private * Run make fix-copies * Adding BTTextModel to PRIVATE_MODELS * Fix for issue with BT Text and Image configs * make style changes * Update README_ja.md Add から to BridgeTower's description * Clean up config, .mdx and arg names * Fix init_weights. Remove nn.Sequential * Formatting and style fixes * Re-add tie_word_embeddings in config * update test implementation * update style * remove commented out * fix style * Update README with abs for BridgeTower * fix style * fix mdx file * Update bridgetower.mdx * Update img src in bridgetower.mdx * Update README.md * Update README.md * resolve style failed * Update _toctree.yml * Update README_ja.md * Removed mlp_ratio, rename feats, rename BTCLIPModel * Replace BTCLIP with BTVisionModel,pass in vision_config to BTVisionModel * Add test_initialization support * Add support for output_hidden_states * Update support for output_hidden_states * Add support for output_attentions * Add docstring for output_hidden_states * update tests * add bridgetowervisionmodel as private model * rerun the PR test * Remove model_type, pass configs to classes, renames * Change self.device to use weight device * Remove image_size * Style check fixes * Add hidden_size and num_hidden_layers to BridgeTowerTransformer * Update device setting * cosmetic update * trigger test again * trigger tests again * Update test_modeling_bridgetower.py trigger tests again * Update test_modeling_bridgetower.py * minor update * re-trigger tests * Update docs/source/en/model_doc/bridgetower.mdx Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Remove pad, update max_text_len, doc cleanup, pass eps to LayerNorm * Added copied to, some more review feedback * make fixup * Use BridgeTowerVisionEmbeddings * Code cleanup * Fixes for BridgeTowerVisionEmbeddings * style checks * re-tests * fix embedding * address comment on init file * retrigger tests * update import prepare_image_inputs * update test_image_processing_bridgetower.py to reflect test_image_processing_common.py * retrigger tests Co-authored-by: Shaoyen Tseng Co-authored-by: Tiep Le Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Co-authored-by: Tiep Le <97980157+tileintel@users.noreply.github.com> --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 4 +- docs/source/en/index.mdx | 2 + docs/source/en/model_doc/bridgetower.mdx | 140 ++ src/transformers/__init__.py | 32 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 1 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 1 + .../models/bridgetower/__init__.py | 92 + .../bridgetower/configuration_bridgetower.py | 387 ++++ .../image_processing_bridgetower.py | 511 +++++ .../bridgetower/modeling_bridgetower.py | 1685 +++++++++++++++++ .../bridgetower/processing_bridgetower.py | 118 ++ src/transformers/utils/dummy_pt_objects.py | 31 + .../utils/dummy_vision_objects.py | 7 + tests/models/bridgetower/__init__.py | 0 .../test_image_processing_bridgetower.py | 258 +++ .../bridgetower/test_modeling_bridgetower.py | 409 ++++ utils/check_repo.py | 6 + 28 files changed, 3696 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/model_doc/bridgetower.mdx create mode 100644 src/transformers/models/bridgetower/__init__.py create mode 100644 src/transformers/models/bridgetower/configuration_bridgetower.py create mode 100644 src/transformers/models/bridgetower/image_processing_bridgetower.py create mode 100644 src/transformers/models/bridgetower/modeling_bridgetower.py create mode 100644 src/transformers/models/bridgetower/processing_bridgetower.py create mode 100644 tests/models/bridgetower/__init__.py create mode 100644 tests/models/bridgetower/test_image_processing_bridgetower.py create mode 100644 tests/models/bridgetower/test_modeling_bridgetower.py diff --git a/README.md b/README.md index ff2e0a4612..71bdd607fb 100644 --- a/README.md +++ b/README.md @@ -289,6 +289,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. diff --git a/README_es.md b/README_es.md index e702a3f2ec..6552f84397 100644 --- a/README_es.md +++ b/README_es.md @@ -282,6 +282,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. diff --git a/README_hd.md b/README_hd.md index 8ae6aba204..e2bdfe7cfb 100644 --- a/README_hd.md +++ b/README_hd.md @@ -254,6 +254,7 @@ conda install -c huggingface transformers 1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigSicence Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (एलेक्सा से) कागज के साथ [बीईआरटी के लिए ऑप्टिमल सबआर्किटेक्चर एक्सट्रैक्शन](https://arxiv.org/abs/ 2010.10499) एड्रियन डी विंटर और डैनियल जे पेरी द्वारा। +1. **[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (हरबिन इंस्टिट्यूट ऑफ़ टेक्नोलॉजी/माइक्रोसॉफ्ट रिसर्च एशिया/इंटेल लैब्स से) कागज के साथ [ब्रिजटॉवर: विजन-लैंग्वेज रिप्रेजेंटेशन लर्निंग में एनकोडर्स के बीच ब्रिज बनाना]() by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google अनुसंधान से) साथ में कागज [ByT5: पूर्व-प्रशिक्षित बाइट-टू-बाइट मॉडल के साथ एक टोकन-मुक्त भविष्य की ओर] (https://arxiv.org/abs/2105.13626) Linting Xue, Aditya Barua, Noah Constant, रामी अल-रफू, शरण नारंग, मिहिर काले, एडम रॉबर्ट्स, कॉलिन रैफेल द्वारा पोस्ट किया गया। 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (इनरिया/फेसबुक/सोरबोन से) साथ में कागज [CamemBERT: एक टेस्टी फ्रेंच लैंग्वेज मॉडल](https:// arxiv.org/abs/1911.03894) लुई मार्टिन*, बेंजामिन मुलर*, पेड्रो जेवियर ऑर्टिज़ सुआरेज़*, योआन ड्यूपॉन्ट, लॉरेंट रोमरी, एरिक विलेमोन्टे डे ला क्लर्जरी, जैमे सेडाह और बेनोइट सगोट द्वारा। 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा। diff --git a/README_ja.md b/README_ja.md index c1fe5813e8..15a56e7c70 100644 --- a/README_ja.md +++ b/README_ja.md @@ -316,6 +316,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (Salesforce から) Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi から公開された研究論文: [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (BigScience workshop から) [BigScience Workshop](https://bigscience.huggingface.co/) から公開されました. 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (Alexa から) Adrian de Wynter and Daniel J. Perry から公開された研究論文: [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) +1. **[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (Harbin Institute of Technology/Microsoft Research Asia/Intel Labs から) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google Research から) Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel から公開された研究論文: [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) diff --git a/README_ko.md b/README_ko.md index f2b928f806..be6b566969 100644 --- a/README_ko.md +++ b/README_ko.md @@ -231,6 +231,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (Alexa 에서) Adrian de Wynter and Daniel J. Perry 의 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 논문과 함께 발표했습니다. +1. **[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google Research 에서) Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel 의 [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 논문과 함께 발표했습니다. 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne 에서) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 의 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 논문과 함께 발표했습니다. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 7a111fd12d..927b59890f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -255,6 +255,7 @@ conda install -c huggingface transformers 1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (来自 Salesforce) 伴随论文 [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) 由 Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi 发布。 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (来自 Alexa) 伴随论文 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 由 Adrian de Wynter and Daniel J. Perry 发布。 +1. **[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (来自 Google Research) 伴随论文 [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 由 Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel 发布。 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 123a076ed4..f3b19fb209 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -267,6 +267,7 @@ conda install -c huggingface transformers 1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](https://huggingface.co/docs/transformers/main/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 31a34b9e88..cf725653af 100755 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -516,6 +516,8 @@ title: AltCLIP - local: model_doc/blip title: BLIP + - local: model_doc/bridgetower + title: BridgeTower - local: model_doc/chinese_clip title: Chinese-CLIP - local: model_doc/clip @@ -595,4 +597,4 @@ - local: internal/file_utils title: General Utilities title: Internal Helpers - title: API \ No newline at end of file + title: API diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 321bbc29b6..e9140b7329 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -68,6 +68,7 @@ The documentation is organized into five sections: 1. **[BLIP](model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. 1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. 1. **[ByT5](model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. 1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. @@ -250,6 +251,7 @@ Flax), PyTorch, and/or TensorFlow. | BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | | BLIP | ❌ | ❌ | ✅ | ❌ | ❌ | | BLOOM | ❌ | ✅ | ✅ | ❌ | ❌ | +| BridgeTower | ❌ | ❌ | ✅ | ❌ | ❌ | | CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | | Chinese-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/bridgetower.mdx b/docs/source/en/model_doc/bridgetower.mdx new file mode 100644 index 0000000000..87015877dc --- /dev/null +++ b/docs/source/en/model_doc/bridgetower.mdx @@ -0,0 +1,140 @@ + + +# BridgeTower + +## Overview + +The BridgeTower model was proposed in [BridgeTower: Building Bridges Between Encoders in Vision-Language Representative Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. The goal of this model is to build a +bridge between each uni-modal encoder and the cross-modal encoder to enable comprehensive and detailed interaction at each layer of the cross-modal encoder thus achieving remarkable performance on various downstream tasks with almost negligible additional performance and computational costs. + +This paper has been accepted to the [AAAI'23](https://aaai.org/Conferences/AAAI-23/) conference. + +The abstract from the paper is the following: + +*Vision-Language (VL) models with the TWO-TOWER architecture have dominated visual-language representation learning in recent years. +Current VL models either use lightweight uni-modal encoders and learn to extract, align and fuse both modalities simultaneously in a deep cross-modal encoder, or feed the last-layer uni-modal representations from the deep pre-trained uni-modal encoders into the top cross-modal encoder. +Both approaches potentially restrict vision-language representation learning and limit model performance. In this paper, we propose BRIDGETOWER, which introduces multiple bridge layers that build a connection between the top layers of uni-modal encoders and each layer of the crossmodal encoder. +This enables effective bottom-up cross-modal alignment and fusion between visual and textual representations of different semantic levels of pre-trained uni-modal encoders in the cross-modal encoder. Pre-trained with only 4M images, BRIDGETOWER achieves state-of-the-art performance on various downstream vision-language tasks. +In particular, on the VQAv2 test-std set, BRIDGETOWER achieves an accuracy of 78.73%, outperforming the previous state-of-the-art model METER by 1.09% with the same pre-training data and almost negligible additional parameters and computational costs. +Notably, when further scaling the model, BRIDGETOWER achieves an accuracy of 81.15%, surpassing models that are pre-trained on orders-of-magnitude larger datasets.* + + + + BridgeTower architecture. Taken from the original paper. + +## Usage + +BridgeTower consists of a visual encoder, a textual encoder and cross-modal encoder with multiple lightweight bridge layers. +The goal of this approach was to build a bridge between each uni-modal encoder and the cross-modal encoder to enable comprehensive and detailed interaction at each layer of the cross-modal encoder. +In principle, one can apply any visual, textual or cross-modal encoder in the proposed architecture. + +The [`BridgeTowerProcessor`] wraps [`RobertaTokenizer`] and [`BridgeTowerImageProcessor`] into a single instance to both +encode the text and prepare the images respectively. + +The following example shows how to run image-text retrieval using [`BridgeTowerProcessor`] and [`BridgeTowerForImageAndTextRetrieval`]. +```python +>>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval +>>> import requests +>>> from PIL import Image + +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) +>>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"] + +>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") +>>> model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") + +>>> # forward pass +>>> scores = dict() +>>> for text in texts: +... # prepare inputs +... encoding = processor(image, text, return_tensors="pt") +... outputs = model(**encoding) +... scores[text] = outputs.logits[0, 1].item() +``` + +The following example shows how to run masked language modeling using [`BridgeTowerProcessor`] and [`BridgeTowerForMaskedLM`]. + +```python +>>> from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM +>>> from PIL import Image +>>> import requests + +>>> url = "http://images.cocodataset.org/val2017/000000360943.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") +>>> text = "a looking out of the window" + +>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") +>>> model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") + +>>> # prepare inputs +>>> encoding = processor(image, text, return_tensors="pt") + +>>> # forward pass +>>> outputs = model(**encoding) + +>>> results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist()) + +>>> print(results) +.a cat looking out of the window. +``` + +This model was contributed by [Anahita Bhiwandiwalla](https://huggingface.co/anahita-b), [Tiep Le](https://huggingface.co/Tile) and [Shaoyen Tseng](https://huggingface.co/shaoyent). The original code can be found [here](https://github.com/microsoft/BridgeTower). + + +Tips: + +- This implementation of BridgeTower uses [`RobertaTokenizer`] to generate text embeddings and OpenAI's CLIP/ViT model to compute visual embeddings. +- Checkpoints for pre-trained [bridgeTower-base](https://huggingface.co/BridgeTower/bridgetower-base) and [bridgetower masked language modeling and image text matching](https://huggingface.co/BridgeTower/bridgetower-base-itm-mlm) are released. +- Please refer to [Table 5](https://arxiv.org/pdf/2206.08657.pdf) for BridgeTower's performance on Image Retrieval and other down stream tasks. +- The PyTorch version of this model is only available in torch 1.10 and higher. + + +## BridgeTowerConfig + +[[autodoc]] BridgeTowerConfig + +## BridgeTowerTextConfig + +[[autodoc]] BridgeTowerTextConfig + +## BridgeTowerVisionConfig + +[[autodoc]] BridgeTowerVisionConfig + +## BridgeTowerImageProcessor + +[[autodoc]] BridgeTowerImageProcessor + - preprocess + +## BridgeTowerProcessor + +[[autodoc]] BridgeTowerProcessor + - __call__ + +## BridgeTowerModel + +[[autodoc]] BridgeTowerModel + - forward + +## BridgeTowerForMaskedLM + +[[autodoc]] BridgeTowerForMaskedLM + - forward + +## BridgeTowerForImageAndTextRetrieval + +[[autodoc]] BridgeTowerForImageAndTextRetrieval + - forward + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 03070da264..b65781ab75 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -185,6 +185,13 @@ _import_structure = { ], "models.bloom": ["BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP", "BloomConfig"], "models.bort": [], + "models.bridgetower": [ + "BRIDGETOWER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "BridgeTowerConfig", + "BridgeTowerProcessor", + "BridgeTowerTextConfig", + "BridgeTowerVisionConfig", + ], "models.byt5": ["ByT5Tokenizer"], "models.camembert": ["CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CamembertConfig"], "models.canine": ["CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP", "CanineConfig", "CanineTokenizer"], @@ -779,6 +786,7 @@ else: _import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"]) _import_structure["models.bit"].extend(["BitImageProcessor"]) _import_structure["models.blip"].extend(["BlipImageProcessor"]) + _import_structure["models.bridgetower"].append("BridgeTowerImageProcessor") _import_structure["models.chinese_clip"].extend(["ChineseCLIPFeatureExtractor", "ChineseCLIPImageProcessor"]) _import_structure["models.clip"].extend(["CLIPFeatureExtractor", "CLIPImageProcessor"]) _import_structure["models.conditional_detr"].extend( @@ -1155,6 +1163,15 @@ else: "BloomPreTrainedModel", ] ) + _import_structure["models.bridgetower"].extend( + [ + "BRIDGETOWER_PRETRAINED_MODEL_ARCHIVE_LIST", + "BridgeTowerForImageAndTextRetrieval", + "BridgeTowerForMaskedLM", + "BridgeTowerModel", + "BridgeTowerPreTrainedModel", + ] + ) _import_structure["models.camembert"].extend( [ "CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3611,6 +3628,13 @@ if TYPE_CHECKING: BlipVisionConfig, ) from .models.bloom import BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP, BloomConfig + from .models.bridgetower import ( + BRIDGETOWER_PRETRAINED_CONFIG_ARCHIVE_MAP, + BridgeTowerConfig, + BridgeTowerProcessor, + BridgeTowerTextConfig, + BridgeTowerVisionConfig, + ) from .models.byt5 import ByT5Tokenizer from .models.camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig from .models.canine import CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP, CanineConfig, CanineTokenizer @@ -4137,6 +4161,7 @@ if TYPE_CHECKING: from .models.beit import BeitFeatureExtractor, BeitImageProcessor from .models.bit import BitImageProcessor from .models.blip import BlipImageProcessor + from .models.bridgetower import BridgeTowerImageProcessor from .models.chinese_clip import ChineseCLIPFeatureExtractor, ChineseCLIPImageProcessor from .models.clip import CLIPFeatureExtractor, CLIPImageProcessor from .models.conditional_detr import ConditionalDetrFeatureExtractor, ConditionalDetrImageProcessor @@ -4456,6 +4481,13 @@ if TYPE_CHECKING: BloomModel, BloomPreTrainedModel, ) + from .models.bridgetower import ( + BRIDGETOWER_PRETRAINED_MODEL_ARCHIVE_LIST, + BridgeTowerForImageAndTextRetrieval, + BridgeTowerForMaskedLM, + BridgeTowerModel, + BridgeTowerPreTrainedModel, + ) from .models.camembert import ( CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, CamembertForCausalLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 8ebc8e9f47..9eade475fa 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -38,6 +38,7 @@ from . import ( blip, bloom, bort, + bridgetower, byt5, camembert, canine, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6eb3947afa..1a77eb0153 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -44,6 +44,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("blenderbot-small", "BlenderbotSmallConfig"), ("blip", "BlipConfig"), ("bloom", "BloomConfig"), + ("bridgetower", "BridgeTowerConfig"), ("camembert", "CamembertConfig"), ("canine", "CanineConfig"), ("chinese_clip", "ChineseCLIPConfig"), @@ -210,6 +211,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( ("blenderbot-small", "BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("blip", "BLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bloom", "BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("bridgetower", "BRIDGETOWER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("chinese_clip", "CHINESE_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -365,6 +367,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("blip", "BLIP"), ("bloom", "BLOOM"), ("bort", "BORT"), + ("bridgetower", "BridgeTower"), ("byt5", "ByT5"), ("camembert", "CamemBERT"), ("canine", "CANINE"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 03755cc658..8f61c7c6ee 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -40,6 +40,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( ("beit", "BeitImageProcessor"), ("bit", "BitImageProcessor"), ("blip", "BlipImageProcessor"), + ("bridgetower", "BridgeTowerImageProcessor"), ("chinese_clip", "ChineseCLIPImageProcessor"), ("clip", "CLIPImageProcessor"), ("clipseg", "ViTImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index ce3de79bd4..2fe18122aa 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -43,6 +43,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("blenderbot-small", "BlenderbotSmallModel"), ("blip", "BlipModel"), ("bloom", "BloomModel"), + ("bridgetower", "BridgeTowerModel"), ("camembert", "CamembertModel"), ("canine", "CanineModel"), ("chinese_clip", "ChineseCLIPModel"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 865d4bec92..e55e76aff6 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -43,6 +43,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( [ ("altclip", "AltCLIPProcessor"), ("blip", "BLIPProcessor"), + ("bridgetower", "BridgeTowerProcessor"), ("chinese_clip", "ChineseCLIPProcessor"), ("clip", "CLIPProcessor"), ("clipseg", "CLIPSegProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 94da66961c..7073221d74 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -79,6 +79,7 @@ else: ("blenderbot-small", ("BlenderbotSmallTokenizer", None)), ("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)), + ("bridgetower", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ("byt5", ("ByT5Tokenizer", None)), ( "camembert", diff --git a/src/transformers/models/bridgetower/__init__.py b/src/transformers/models/bridgetower/__init__.py new file mode 100644 index 0000000000..363d996c12 --- /dev/null +++ b/src/transformers/models/bridgetower/__init__.py @@ -0,0 +1,92 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +# rely on isort to merge the imports +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available + + +_import_structure = { + "configuration_bridgetower": [ + "BRIDGETOWER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "BridgeTowerConfig", + "BridgeTowerTextConfig", + "BridgeTowerVisionConfig", + ], + "processing_bridgetower": ["BridgeTowerProcessor"], +} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_bridgetower"] = ["BridgeTowerImageProcessor"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_bridgetower"] = [ + "BRIDGETOWER_PRETRAINED_MODEL_ARCHIVE_LIST", + "BridgeTowerForImageAndTextRetrieval", + "BridgeTowerForMaskedLM", + "BridgeTowerModel", + "BridgeTowerPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_bridgetower import ( + BRIDGETOWER_PRETRAINED_CONFIG_ARCHIVE_MAP, + BridgeTowerConfig, + BridgeTowerTextConfig, + BridgeTowerVisionConfig, + ) + from .processing_bridgetower import BridgeTowerProcessor + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_bridgetower import BridgeTowerImageProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_bridgetower import ( + BRIDGETOWER_PRETRAINED_MODEL_ARCHIVE_LIST, + BridgeTowerForImageAndTextRetrieval, + BridgeTowerForMaskedLM, + BridgeTowerModel, + BridgeTowerPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py new file mode 100644 index 0000000000..bd1457719d --- /dev/null +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -0,0 +1,387 @@ +# coding=utf-8 +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License=, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing=, software +# distributed under the License is distributed on an "AS IS" BASIS=, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BridgeTower model configuration""" + +import copy +import os +from typing import Union + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +BRIDGETOWER_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "BridgeTower/bridgetower-base": "https://huggingface.co/BridgeTower/bridgetower-base/blob/main/config.json", + "BridgeTower/bridgetower-base-itm-mlm": ( + "https://huggingface.co/BridgeTower/bridgetower-base-itm-mlm/blob/main/config.json" + ), +} + + +class BridgeTowerVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the vision configuration of a [`BridgeTowerModel`]. Instantiating a + configuration with the defaults will yield a similar configuration to that of the bridgetower-base + [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in visual encoder model. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + image_size (`int`, *optional*, defaults to 288): + The size (resolution) of each image. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + stop_gradient (`bool`, *optional*, defaults to `False`): + Whether to stop gradient for training. + share_layernorm (`bool`, *optional*, defaults to `True`): + Whether LayerNorm layers are shared. + remove_last_layer (`bool`, *optional*, defaults to `False`): + Whether to remove the last layer from the vision encoder. + + + Example: + + ```python + >>> from transformers import BridgeTowerVisionConfig + + >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration for the vision model + >>> configuration = BridgeTowerVisionConfig() + + >>> # Accessing the configuration + >>> configuration + ```""" + model_type = "bridgetower_vision_model" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_channels=3, + patch_size=16, + image_size=288, + initializer_factor=1, + layer_norm_eps=1e-05, + stop_gradient=False, + share_layernorm=True, + remove_last_layer=False, + **kwargs + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_factor = initializer_factor + self.layer_norm_eps = layer_norm_eps + self.stop_gradient = stop_gradient + self.share_layernorm = share_layernorm + self.remove_last_layer = remove_last_layer + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + if config_dict.get("model_type") == "bridgetower": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class BridgeTowerTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the text configuration of a [`BridgeTowerModel`]. The default values here + are copied from RoBERTa. Instantiating a configuration with the defaults will yield a similar configuration to that + of the bridgetower-base [BridegTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the text part of the model. Defines the number of different tokens that can be + represented by the `inputs_ids` passed when calling [`BridgeTowerModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 514): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids`. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Example: + + ```python + >>> from transformers import BridgeTowerTextConfig + + >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration for the text model + >>> configuration = BridgeTowerTextConfig() + + >>> # Accessing the configuration + >>> configuration + ```""" + model_type = "bridgetower_text_model" + + def __init__( + self, + vocab_size=50265, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + initializer_factor=1, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=514, + type_vocab_size=1, + initializer_range=0.02, + layer_norm_eps=1e-05, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs + ): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_factor = initializer_factor + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + if config_dict.get("model_type") == "bridgetower": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class BridgeTowerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BridgeTowerModel`]. It is used to instantiate a + BridgeTower model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the bridgetower-base + [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + share_cross_modal_transformer_layers (`bool`, *optional*, defaults to `True`): + Whether cross modal transformer layers are shared. + drop_rate (`float`, *optional*, defaults to 0.1): + Drop out probability. + head_hidden_scale (`int`, *optional*, defaults to 2): + Scale of hidden layers head. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + is_encoder_decoder (`bool`, *optional*, defaults to `False`): + Whether this is an encoder/decoder model + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + share_link_tower_layers (`bool`, *optional*, defaults to `False`): + Whether the bride/link tower layers are shared. + link_tower_type (`str`, *optional*, defaults to `"add"`): + Type of the bridge/link layer. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer encoder. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie input and output embeddings. + init_layernorm_from_vision_encoder (`bool`, *optional*, defaults to `False`): + Whether to init LayerNorm from the vision encoder. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`BridgeTowerTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`BridgeTowerVisionConfig`]. + + Example: + + ```python + >>> from transformers import BridgeTowerModel, BridgeTowerConfig + + >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration + >>> configuration = BridgeTowerConfig() + + >>> # Initializing a model from the BridgeTower/bridgetower-base style configuration + >>> model = BridgeTowerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "bridgetower" + + def __init__( + self, + share_cross_modal_transformer_layers=True, + drop_rate=0.1, + head_hidden_scale=2, + hidden_act="gelu", + hidden_size=768, + initializer_factor=1, + is_encoder_decoder=False, + layer_norm_eps=1e-05, + share_link_tower_layers=False, + link_tower_type="add", + num_attention_heads=12, + num_hidden_layers=6, + tie_word_embeddings=False, + init_layernorm_from_vision_encoder=False, + text_config=None, + vision_config=None, + **kwargs + ): + super().__init__(**kwargs) + self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers + self.drop_rate = drop_rate + self.head_hidden_scale = head_hidden_scale + self.hidden_act = hidden_act + self.hidden_size = hidden_size + self.initializer_factor = initializer_factor + self.is_encoder_decoder = is_encoder_decoder + self.layer_norm_eps = layer_norm_eps + self.share_link_tower_layers = share_link_tower_layers + self.link_tower_type = link_tower_type + self.num_attention_heads = num_attention_heads + self.num_hidden_layers = num_hidden_layers + self.tie_word_embeddings = tie_word_embeddings + self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder + + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: + text_config = text_config_dict + if vision_config_dict is not None: + vision_config = vision_config_dict + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the BridgeTowerTextConfig with default values.") + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. Initializing the BridgeTowerVisionConfig with default values.") + + self.text_config = BridgeTowerTextConfig(**text_config) + self.vision_config = BridgeTowerVisionConfig(**vision_config) + + @classmethod + def from_text_vision_configs( + cls, text_config: BridgeTowerTextConfig, vision_config: BridgeTowerVisionConfig, **kwargs + ): + r""" + Instantiate a [`BridgeTowerConfig`] (or a derived class) from BridgeTower text model configuration. Returns: + [`BridgeTowerConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py new file mode 100644 index 0000000000..8e8be8373c --- /dev/null +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -0,0 +1,511 @@ +# coding=utf-8 +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for BridgeTower.""" + +import warnings +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np + +from transformers.utils import is_vision_available +from transformers.utils.generic import TensorType + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import PaddingMode, center_crop, normalize, pad, rescale, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_batched, + to_numpy_array, + valid_images, +) +from ...utils import logging + + +if is_vision_available(): + import PIL + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.vilt.image_processing_vilt.max_across_indices +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.vilt.image_processing_vilt.make_pixel_mask +def make_pixel_mask(image: np.ndarray, output_size: Tuple[int, int]) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`Tuple[int, int]`): + Output size of the mask. + """ + input_height, input_width = get_image_size(image) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + + +# Copied from transformers.models.vilt.image_processing_vilt.get_max_height_width +def get_max_height_width(images: List[np.ndarray]) -> List[int]: + """ + Get the maximum height and width across all images in a batch. + """ + input_channel_dimension = infer_channel_dimension_format(images[0]) + + if input_channel_dimension == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_channel_dimension == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_channel_dimension}") + return (max_height, max_width) + + +# Copied from transformers.models.vilt.image_processing_vilt.get_resize_output_image_size +def get_resize_output_image_size( + input_image: np.ndarray, shorter: int = 800, longer: int = 1333, size_divisor: int = 32 +) -> Tuple[int, int]: + input_height, input_width = get_image_size(input_image) + min_size, max_size = shorter, longer + + scale = min_size / min(input_height, input_width) + + if input_height < input_width: + new_height = min_size + new_width = scale * input_width + else: + new_height = scale * input_height + new_width = min_size + + if max(new_height, new_width) > max_size: + scale = max_size / max(new_height, new_width) + new_height = scale * new_height + new_width = scale * new_width + + new_height, new_width = int(new_height + 0.5), int(new_width + 0.5) + new_height = new_height // size_divisor * size_divisor + new_width = new_width // size_divisor * size_divisor + + return new_height, new_width + + +class BridgeTowerImageProcessor(BaseImageProcessor): + r""" + Constructs a BridgeTower image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `288`): + Resize the shorter side of the input to `size["shortest_edge"]`. The longer side will be limited to under + `int((1333 / 800) * size["shortest_edge"])` while preserving the aspect ratio. Only has an effect if + `do_resize` is set to `True`. Can be overridden by the `size` parameter in the `preprocess` method. + size_divisor (`int`, *optional*, defaults to `32`): + The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize` + is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image. Can be overridden by the `do_center_crop` parameter in the `preprocess` + method. + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image to the `(max_height, max_width)` of the images in the batch. Can be overridden by + the `do_pad` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = 288, + size_divisor: int = 32, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_center_crop: bool = True, + do_pad: bool = True, + **kwargs + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 288} + size = get_size_dict(size, default_to_square=False) + + self.do_resize = do_resize + self.size = size + self.size_divisor = size_divisor + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073] + self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711] + self.do_pad = do_pad + self.do_center_crop = do_center_crop + + # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + size_divisor: int = 32, + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. + size_divisor (`int`, defaults to 32): + The image is resized to a size that is a multiple of this value. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" not in size: + raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}") + shorter = size["shortest_edge"] + longer = int(1333 / 800 * shorter) + output_size = get_resize_output_image_size(image, shorter=shorter, longer=longer, size_divisor=size_divisor) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def center_crop( + self, + image: np.ndarray, + size: Dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Center crop an image to (size["height"], size["width"]). If the input size is smaller than `size` along any + edge, the image is padded with 0's and then center cropped. + + Args: + image (`np.ndarray`): + Image to center crop. + size (`Dict[str, int]`): + Size of the output image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + output_size = size["shortest_edge"] + return center_crop(image, size=(output_size, output_size), data_format=data_format, **kwargs) + + # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.normalize + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `List[float]`): + Image mean. + std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def _pad_image( + self, + image: np.ndarray, + output_size: Tuple[int, int], + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, padding, mode=PaddingMode.CONSTANT, constant_values=constant_values, data_format=data_format + ) + return padded_image + + def pad( + self, + images: List[np.ndarray], + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + ) -> BatchFeature: + """ + Pads a batch of images with zeros to the size of largest height and width in the batch and optionally returns + their corresponding pixel mask. + + Args: + images (`List[np.ndarray]`): + Batch of images to pad. + return_pixel_mask (`bool`, *optional*, defaults to `False`): + Whether to return the pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + pad_size = get_max_height_width(images) + padded_images = [ + self._pad_image(image=image, output_size=pad_size, data_format=data_format) for image in images + ] + data = {"pixel_values": padded_images} + if return_pixel_mask: + masks = [make_pixel_mask(image=image, output_size=pad_size) for image in images] + data["pixel_mask"] = masks + + return BatchFeature(data=data, tensor_type=return_tensors) + + # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad_and_create_pixel_mask + def pad_and_create_pixel_mask( + self, + pixel_values_list: List[ImageInput], + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + ) -> BatchFeature: + """ + Pads a batch of images with zeros to the size of largest height and width in the batch and returns their + corresponding pixel mask. + + Args: + images (`List[np.ndarray]`): + Batch of images to pad. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + warnings.warn( + "This method is deprecated and will be removed in v4.26.0. Please use pad instead.", FutureWarning + ) + # pad expects a list of np.ndarray, but the previous feature extractors expected torch tensors + images = [to_numpy_array(image) for image in pixel_values_list] + return self.pad( + images=images, + return_pixel_mask=True, + return_tensors=return_tensors, + data_format=data_format, + ) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + size_divisor: Optional[int] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + do_center_crop: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + size_divisor (`int`, *optional*, defaults to `self.size_divisor`): + The image is resized to a size that is a multiple of this value. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image to the (max_height, max_width) in the batch. If `True`, a pixel mask is also + created and returned. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the + image is padded with 0's and then center cropped. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size_divisor = size_divisor if size_divisor is not None else self.size_divisor + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_pad = do_pad if do_pad is not None else self.do_pad + do_center_crop if do_center_crop is not None else self.do_center_crop + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if do_resize and size is None or resample is None: + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [ + self.resize(image=image, size=size, size_divisor=size_divisor, resample=resample) for image in images + ] + + if do_center_crop: + images = [self.center_crop(image=image, size=size) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + if do_pad: + encoded_outputs = self.pad(images, return_pixel_mask=True, return_tensors=return_tensors) + else: + encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + + return encoded_outputs diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py new file mode 100644 index 0000000000..f520a221f8 --- /dev/null +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -0,0 +1,1685 @@ +# coding=utf-8 +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BridgeTower Model""" + +import math +from collections import OrderedDict +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN, QuickGELUActivation +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + MaskedLMOutput, + ModelOutput, + SequenceClassifierOutput, +) +from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward +from ...pytorch_utils import find_pruneable_heads_and_indices, is_torch_greater_or_equal_than_1_10, prune_linear_layer +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from .configuration_bridgetower import BridgeTowerConfig, BridgeTowerTextConfig, BridgeTowerVisionConfig + + +logger = logging.get_logger(__name__) + +if not is_torch_greater_or_equal_than_1_10: + logger.warning( + f"You are using torch=={torch.__version__}, but torch>=1.10.0 is required to use " + "BridgeTowerModel. Please upgrade torch." + ) + +_CONFIG_FOR_DOC = "BridgeTowerConfig" +_CHECKPOINT_FOR_DOC = "BridgeTower/bridgetower-base" +_TOKENIZER_FOR_DOC = "RobertaTokenizer" + +BRIDGETOWER_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "BridgeTower/bridgetower-base", + "BridgeTower/bridgetower-base-itm-mlm" + # See all bridgetower models at https://huggingface.co/BridgeTower +] + + +BRIDGETOWER_START_DOCSTRING = r""" + This model is a PyTorch `torch.nn.Module `_ subclass. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`BridgeTowerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BRIDGETOWER_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`BertTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input + IDs?](../glossary#input-ids) + + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + [What are token type IDs?](../glossary#token-type-ids) + + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`BridgeTowerImageProcessor`]. See + [`BridgeTowerImageProcessor.__call__`] for details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + `What are attention masks? <../glossary.html#attention-mask>`__ + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + + image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*): + Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `pixel_values` into patch embeddings. + + image_token_type_idx (`int`, *optional*): + - The token type ids for images. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@dataclass +class BridgeTowerModelOutput(ModelOutput): + """ + Output type of [`BridgeTowerModel`]. + + Args: + text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`): + Sequence of hidden-states at the text output of the last layer of the model. + image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`): + Sequence of hidden-states at the image output of the last layer of the model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`): + Concatenation of last layer hidden-state of the first token of the text and image sequence (classification + token), respectively, after further processing through layers used for auxiliary pretraining tasks. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + text_features: torch.FloatTensor = None + image_features: torch.FloatTensor = None + pooler_output: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class BridgeTowerResidualAttention(nn.Module): + def __init__(self, config): + super().__init__() + + self.attn = nn.MultiheadAttention(config.hidden_size, config.hidden_size // 64) + self.ln_1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.mlp = nn.ModuleDict( + OrderedDict( + [ + ("c_fc", nn.Linear(config.hidden_size, config.hidden_size * 4)), + ("gelu", QuickGELUActivation()), + ("c_proj", nn.Linear(config.hidden_size * 4, config.hidden_size)), + ] + ) + ) + self.ln_2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attn_mask = None + + def attention(self, hidden_state: torch.Tensor, attention_mask: torch.Tensor): + if attention_mask is not None: + attention_mask = attention_mask.to(dtype=torch.bool, device=hidden_state.device) + self.attn_mask = ( + self.attn_mask.to(dtype=hidden_state.dtype, device=hidden_state.device) + if self.attn_mask is not None + else None + ) + return self.attn( + hidden_state, + hidden_state, + hidden_state, + need_weights=False, + attn_mask=self.attn_mask, + key_padding_mask=attention_mask, + )[0] + + def forward(self, hidden_state: torch.Tensor, attention_mask: torch.Tensor = None): + residual_state = hidden_state + self.attention(self.ln_1(hidden_state), attention_mask) + hidden_state = self.ln_2(residual_state) + for _, layer in self.mlp.items(): + hidden_state = layer(hidden_state) + hidden_state = residual_state + hidden_state + return hidden_state + + +class BridgeTowerTransformer(nn.Module): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.num_hidden_layers = config.num_hidden_layers + if config.remove_last_layer: + self.resblocks = nn.ModuleList( + [BridgeTowerResidualAttention(config) for _ in range(self.num_hidden_layers - 1)] + ) + else: + self.resblocks = nn.ModuleList( + [BridgeTowerResidualAttention(config) for _ in range(self.num_hidden_layers)] + ) + self.stop_gradient = config.stop_gradient + + def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): + hidden_states = [] + for block in self.resblocks: + hidden_state = block(hidden_state, attention_mask) + if self.stop_gradient: + hidden_states.append(hidden_state.detach()) + else: + hidden_states.append(hidden_state) + return hidden_states + + +# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->BridgeTower +class BridgeTowerVisionEmbeddings(nn.Module): + def __init__(self, config: BridgeTowerVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class BridgeTowerVisionTransformer(nn.Module): + def __init__(self, config): + super().__init__() + + self.embeddings = BridgeTowerVisionEmbeddings(config) + self.ln_pre = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.transformer = BridgeTowerTransformer(config) + self.ln_post = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.share_layernorm = config.share_layernorm + if not config.share_layernorm: + self.ln_separate = nn.ModuleList( + [nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) for _ in range(config.num_hidden_layers)] + ) + + def forward(self, pixel_values: torch.Tensor, attention_mask): + hidden_states = self.embeddings(pixel_values) + hidden_states = self.ln_pre(hidden_states) + # NLD -> LND + hidden_states = hidden_states.permute(1, 0, 2) + + hidden_states = self.transformer(hidden_states, attention_mask) + # shape = [num_hidden_layers, hidden_size, *, grid ** 2] + hidden_states = torch.stack(hidden_states, dim=0) + # shape = [num_hidden_layers, *, hidden_size, grid ** 2] + hidden_states = hidden_states.permute(0, 2, 1, 3) + if self.share_layernorm: + hidden_states = self.ln_post(hidden_states) + else: + hidden_states_stack = [] + for hidden_states, ln in zip(hidden_states, self.ln_separate): + hidden_states = ln(hidden_states) + hidden_states_stack.append(hidden_states) + # shape = [num_hidden_layers, *, hidden_size, grid ** 2] + hidden_states = torch.stack(hidden_states_stack, dim=0) + return hidden_states + + def forward_pre(self, pixel_values: torch.Tensor): + hidden_states = self.embeddings(pixel_values) + hidden_states = self.ln_pre(hidden_states) + # NLD -> LND + hidden_states = hidden_states.permute(1, 0, 2) + return hidden_states + + def forward_post(self, hidden_state: torch.Tensor): + visual_output_post = hidden_state.permute(1, 0, 2) + visual_output_post = self.ln_post(visual_output_post) + return visual_output_post + + +class BridgeTowerLinkTower(nn.Module): + def __init__(self, config): + super().__init__() + self.link_tower_type = config.link_tower_type + self.hidden_size = config.hidden_size + if config.link_tower_type in ["add", "scaled_add", "interpolate"]: + if config.link_tower_type == "scaled_add": + self.scaled_factor = nn.Parameter(torch.tensor(1.0)) + elif config.link_tower_type == "interpolate": + self.beta = nn.Parameter(torch.tensor(0.5)) + self.LayerNorm = nn.LayerNorm(self.hidden_size, eps=config.layer_norm_eps) + else: + raise NotImplementedError(f"link_tower_type {config.link_tower_type} is not implemented") + + def forward(self, hidden_states, cross_modal_hidden_states, attention_mask): + if self.link_tower_type == "add": + return self.LayerNorm(hidden_states + cross_modal_hidden_states) + elif self.link_tower_type == "scaled_add": + return self.LayerNorm(hidden_states * self.scaled_factor + cross_modal_hidden_states) + elif self.link_tower_type == "interpolate": + return self.LayerNorm(hidden_states * (1 - self.beta) + cross_modal_hidden_states * self.beta) + else: + raise NotImplementedError(f"link_tower_type {self.link_tower_type} is not implemented") + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BridgeTower +class BridgeTowerSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->BridgeTower +class BridgeTowerIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->BridgeTower +class BridgeTowerOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->BridgeTower +class BridgeTowerPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->BridgeTower +class BridgeTowerSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BridgeTowerModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BridgeTower +class BridgeTowerAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BridgeTowerSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = BridgeTowerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BridgeTowerBertCrossLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BridgeTowerAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + self.crossattention = BridgeTowerAttention(config) + self.intermediate = BridgeTowerIntermediate(config) + self.output = BridgeTowerOutput(config) + + def forward( + self, + hidden_states, + encoder_hidden_states, + attention_mask=None, + head_mask=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attention_outputs = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=None, + output_attentions=output_attentions, + past_key_value=None, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + # add self attentions if we output attention weights + outputs = self_attention_outputs[1:] + + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BridgeTowerTextLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BridgeTowerAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BridgeTowerAttention(config, position_embedding_type="absolute") + self.intermediate = BridgeTowerIntermediate(config) + self.output = BridgeTowerOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->BridgeTowerText +class BridgeTowerTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BridgeTowerTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->BridgeTowerText +class BridgeTowerTextEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +class BridgeTowerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BridgeTowerConfig + base_model_prefix = "bridgetower" + supports_gradient_checkpointing = False + _no_split_modules = ["BridgeTowerSelfAttention"] + + def _init_weights(self, module): + if isinstance(module, BridgeTowerVisionModel): + proj_std = (module.visual.transformer.hidden_size**-0.5) * ( + (2 * module.visual.transformer.num_hidden_layers) ** -0.5 + ) + attn_std = module.visual.transformer.hidden_size**-0.5 + fc_std = (2 * module.visual.transformer.hidden_size) ** -0.5 + for block in module.visual.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std * self.config.initializer_factor) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std * self.config.initializer_factor) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std * self.config.initializer_factor) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std * self.config.initializer_factor) + + nn.init.normal_(module.visual.embeddings.class_embedding, std=attn_std * self.config.initializer_factor) + nn.init.normal_( + module.visual.embeddings.position_embedding.weight, std=attn_std * self.config.initializer_factor + ) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.05 * self.config.initializer_factor) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BridgeTowerVisionModel(BridgeTowerPreTrainedModel): + config_class = BridgeTowerVisionConfig + + def __init__(self, config): + super().__init__(config) + self.visual = BridgeTowerVisionTransformer(config) + + @property + def dtype(self): + return self.visual.embeddings.patch_embedding.weight.dtype + + def forward(self, image, image_mask=None): + return self.visual(image.type(self.dtype), image_mask) + + +class BridgeTowerTextModel(BridgeTowerPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + + .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762 + + """ + + config_class = BridgeTowerTextConfig + + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BridgeTowerTextEmbeddings(config) + self.encoder = BridgeTowerTextEncoder(config) + + self.pooler = BridgeTowerPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + # Copied from transformers.models.roberta.modeling_roberta.RobertaModel.forward + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + "The bare BridgeTower Model transformer outputting BridgeTowerModelOutput object without any specific head on" + " top.", + BRIDGETOWER_START_DOCSTRING, +) +class BridgeTowerModel(BridgeTowerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + vision_config = config.vision_config + text_config = config.text_config + + if config.share_cross_modal_transformer_layers: + self.cross_modal_text_transform = nn.Linear(text_config.hidden_size, config.hidden_size) + self.cross_modal_image_transform = nn.Linear(vision_config.hidden_size, config.hidden_size) + else: + self.cross_modal_text_transform = nn.ModuleList( + [nn.Linear(text_config.hidden_size, config.hidden_size) for _ in range(config.num_hidden_layers)] + ) + self.cross_modal_image_transform = nn.ModuleList( + [nn.Linear(vision_config.hidden_size, config.hidden_size) for _ in range(config.num_hidden_layers)] + ) + + self.token_type_embeddings = nn.Embedding(2, config.hidden_size) + + self.vision_model = BridgeTowerVisionModel(vision_config) + + self.text_model = BridgeTowerTextModel(text_config) + + if not vision_config.share_layernorm and config.init_layernorm_from_vision_encoder: + for ln in self.vision_model.visual.cross_modal_ln_separate: + ln.weight.data = self.vision_model.visual.ln_post.weight.data + ln.bias.data = self.vision_model.visual.ln_post.bias.data + + self.cross_modal_image_layers = nn.ModuleList( + [BridgeTowerBertCrossLayer(text_config) for _ in range(config.num_hidden_layers)] + ) + self.cross_modal_text_layers = nn.ModuleList( + [BridgeTowerBertCrossLayer(text_config) for _ in range(config.num_hidden_layers)] + ) + + # Class token => Linear => Tanh + self.cross_modal_image_pooler = BridgeTowerPooler(config) + self.cross_modal_text_pooler = BridgeTowerPooler(config) + + # Initialize BridgeTower Components + self.cross_modal_text_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.cross_modal_image_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.share_link_tower_layers: + self.cross_modal_text_link_tower = BridgeTowerLinkTower(config) + self.cross_modal_image_link_tower = BridgeTowerLinkTower(config) + else: + self.cross_modal_text_link_tower = nn.ModuleList( + [BridgeTowerLinkTower(config) for _ in range(config.num_hidden_layers - 1)] + ) + self.cross_modal_image_link_tower = nn.ModuleList( + [BridgeTowerLinkTower(config) for _ in range(config.num_hidden_layers - 1)] + ) + + self.post_init() + + @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BridgeTowerModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + image_token_type_idx: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[Tuple[torch.Tensor], BridgeTowerModelOutput]: + r""" + output_hidden_states (`bool`, *optional*): + If set to `True`, hidden states are returned as a list containing the hidden states of text, image, and + cross-modal components respectively. i.e. `(hidden_states_text, hidden_states_image, + hidden_states_cross_modal)` where each element is a list of the hidden states of the corresponding + modality. `hidden_states_txt/img` are a list of tensors corresponding to unimodal hidden states and + `hidden_states_cross_modal` is a list of tuples containing `cross_modal_text_hidden_states` and + `cross_modal_image_hidden_states` of each brdige layer. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels are currently not supported. + Returns: + + Examples: + + ```python + >>> from transformers import BridgeTowerProcessor, BridgeTowerModel + >>> from PIL import Image + >>> import requests + + >>> # prepare image and text + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "hello world" + >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base") + >>> model = BridgeTowerModel.from_pretrained("BridgeTower/bridgetower-base") + + >>> inputs = processor(image, text, return_tensors="pt") + >>> outputs = model(**inputs) + >>> outputs.keys() + odict_keys(['text_features', 'image_features', 'pooler_output']) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + all_hidden_states_text = () if output_hidden_states else None + all_hidden_states_image = () if output_hidden_states else None + all_hidden_states_cross = () if output_hidden_states else None + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + image_token_type_idx = image_token_type_idx if image_token_type_idx else 1 + input_shape = input_ids.size() + text_embeds = self.text_model.embeddings(input_ids=input_ids) + + if output_hidden_states: + all_hidden_states_text += (text_embeds,) + + if attention_mask is None: + attention_mask = torch.ones(input_shape, dtype=torch.long, device=input_ids.device) + extend_text_masks = self.text_model.get_extended_attention_mask(attention_mask, input_shape).to( + input_ids.device + ) + + # The split_index determines how many layers of the uni-modal encoder are applied before the cross-modal encoder + split_index = len(self.text_model.encoder.layer) - self.config.num_hidden_layers + 1 + + # Run the first 'split_index' layers of the textual encoder + for layer in self.text_model.encoder.layer[:split_index]: + text_embeds = layer(text_embeds, extend_text_masks)[0] + + if output_hidden_states: + all_hidden_states_text += (text_embeds,) + + image_embeds = self.vision_model.visual.forward_pre(pixel_values.type(self.vision_model.dtype)) + if output_hidden_states: + all_hidden_states_image += (image_embeds,) + + # Run the first 'split_index' layers of the visual encoder + for block in self.vision_model.visual.transformer.resblocks[:split_index]: + image_embeds = block(image_embeds) + if output_hidden_states: + all_hidden_states_image += (image_embeds,) + + image_embeds_with_ln = self.vision_model.visual.forward_post(image_embeds.type(self.vision_model.dtype)) + + # first layer is a special case because we don't have the output from the cross-encoder yet + cross_modal_text = self.cross_modal_text_transform(text_embeds) + + text_token_type_embeddings = self.token_type_embeddings( + torch.zeros(1, dtype=torch.long, device=input_ids.device) + ).expand_as(cross_modal_text) + + cross_modal_text = self.cross_modal_text_layernorm(cross_modal_text + text_token_type_embeddings) + + image_embeds_with_ln = self.cross_modal_image_transform(image_embeds_with_ln) + image_token_type_embeddings = self.token_type_embeddings( + torch.full((1,), image_token_type_idx, dtype=torch.long, device=input_ids.device) + ).expand_as(image_embeds_with_ln) + + image_embeds_with_ln = image_embeds_with_ln + image_token_type_embeddings + cross_modal_image = self.cross_modal_image_layernorm(image_embeds_with_ln) + + pixel_mask = torch.ones( + (cross_modal_image.size(0), cross_modal_image.size(1)), + dtype=torch.long, + device=input_ids.device, + ) + extend_image_masks = self.text_model.get_extended_attention_mask(pixel_mask, pixel_mask.size()).to( + input_ids.device + ) + + layer_outputs_text = self.cross_modal_text_layers[0]( + cross_modal_text, + cross_modal_image, + attention_mask=extend_text_masks, + encoder_attention_mask=extend_image_masks, + output_attentions=output_attentions, + ) + cross_text_features = layer_outputs_text[0] + + layer_outputs_image = self.cross_modal_image_layers[0]( + cross_modal_image, + cross_modal_text, + attention_mask=extend_image_masks, + encoder_attention_mask=extend_text_masks, + output_attentions=output_attentions, + ) + cross_image_features = layer_outputs_image[0] + + if output_hidden_states: + all_hidden_states_cross += ((cross_text_features, cross_image_features),) + + if output_attentions: + all_self_attentions += ((layer_outputs_text[1], layer_outputs_image[1]),) + + link_layer_index = 0 + + # Each of the top 6 layers of the visual and textual encoders ([split_index:]) is connected to each layer of + # the cross-modal encoder via bridge layers, which brings bottom-up alignment and fusion to the cross-modal encoder. + for i in range(split_index, len(self.text_model.encoder.layer)): + text_embeds = self.text_model.encoder.layer[i](text_embeds, extend_text_masks)[0] + image_embeds = self.vision_model.visual.transformer.resblocks[i](image_embeds).type( + self.vision_model.dtype + ) + image_embeds_with_ln = ( + self.cross_modal_image_transform(self.vision_model.visual.forward_post(image_embeds)) + + image_token_type_embeddings + ) + + text_link_tower = self.cross_modal_text_link_tower[link_layer_index] + image_link_tower = self.cross_modal_image_link_tower[link_layer_index] + + # Bridge layers for textual and visual encoders + cross_text_features_ = text_link_tower( + self.cross_modal_text_transform(text_embeds) + text_token_type_embeddings, + cross_text_features, + extend_text_masks, + ) + cross_image_features_ = image_link_tower(image_embeds_with_ln, cross_image_features, extend_image_masks) + + # Cross-modal encoder via bridge layers of textual and visual encoders + layer_outputs_text = self.cross_modal_text_layers[link_layer_index + 1]( + cross_text_features_, + cross_image_features_, + attention_mask=extend_text_masks, + encoder_attention_mask=extend_image_masks, + output_attentions=output_attentions, + ) + cross_text_features = layer_outputs_text[0] + + layer_outputs_image = self.cross_modal_image_layers[link_layer_index + 1]( + cross_image_features_, + cross_text_features_, + attention_mask=extend_image_masks, + encoder_attention_mask=extend_text_masks, + output_attentions=output_attentions, + ) + cross_image_features = layer_outputs_image[0] + + link_layer_index += 1 + + if output_hidden_states: + all_hidden_states_text += (text_embeds,) + all_hidden_states_image += (image_embeds,) + all_hidden_states_cross += ((cross_text_features, cross_image_features),) + + if output_attentions: + all_self_attentions += ((layer_outputs_text[1], layer_outputs_image[1]),) + + # Concatenate the cls token of the text and image features to get the final represtation + text_features, image_features = cross_text_features, cross_image_features + cls_features = self.get_cls_features(text_features, image_features) + + if output_hidden_states: + all_hidden_states = (all_hidden_states_text, all_hidden_states_image, all_hidden_states_cross) + + if not return_dict: + return tuple(v for v in [text_features, image_features, cls_features] if v is not None) + + return BridgeTowerModelOutput( + text_features=text_features, + image_features=image_features, + pooler_output=cls_features, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def get_cls_features(self, text_features, image_features): + cls_features_text = self.cross_modal_text_pooler(text_features) + cls_features_image = self.cross_modal_image_pooler(image_features) + return torch.cat([cls_features_text, cls_features_image], dim=-1) + + +# Copied from transformers.models.vilt.modeling_vilt.ViltPredictionHeadTransform with Vilt->BridgeTower +class BridgeTowerPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BridgeTowerMLMHead(nn.Module): + def __init__(self, config, weight=None): + super().__init__() + self.config = config + self.transform = BridgeTowerPredictionHeadTransform(config) + self.decoder = nn.Linear(config.hidden_size, config.text_config.vocab_size, bias=False) + self.bias = nn.Parameter(torch.zeros(config.text_config.vocab_size)) + if weight is not None: + self.decoder.weight = weight + + def forward(self, x): + mlm_score = self.transform(x) + mlm_score = self.decoder(mlm_score) + self.bias + return mlm_score + + +class BridgeTowerITMHead(nn.Module): + def __init__(self, hidden_size): + super().__init__() + self.fc = nn.Linear(hidden_size, 2) + + def forward(self, x): + itm_score = self.fc(x) + return itm_score + + +@add_start_docstrings( + """ + BridgeTower Model with a language modeling head on top as done during pretraining. + """, + BRIDGETOWER_START_DOCSTRING, +) +class BridgeTowerForMaskedLM(BridgeTowerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bridgetower = BridgeTowerModel(config) + self.mlm_score = BridgeTowerMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.mlm_score.decoder + + def set_output_embeddings(self, new_embeddings): + self.mlm_score.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels are currently not supported. + Returns: + + Examples: + + ```python + >>> from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000360943.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + >>> text = "a looking out of the window" + + >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") + >>> model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") + + >>> # prepare inputs + >>> encoding = processor(image, text, return_tensors="pt") + + >>> # forward pass + >>> outputs = model(**encoding) + + >>> results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist()) + + >>> print(results) + .a cat looking out of the window. + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.bridgetower( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + pixel_values=pixel_values, + pixel_mask=pixel_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + image_embeds=image_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + mlm_logits = self.mlm_score(outputs.text_features if return_dict else outputs[0]) + + if not return_dict: + return tuple(mlm_logits) + + return MaskedLMOutput( + logits=mlm_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BridgeTower Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the + [CLS] token) for image-to-text matching. + """, + BRIDGETOWER_START_DOCSTRING, +) +class BridgeTowerForImageAndTextRetrieval(BridgeTowerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bridgetower = BridgeTowerModel(config) + + self.itm_score = BridgeTowerITMHead(config.hidden_size * 2) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels are currently not supported. + Returns: + + Examples: + + ```python + >>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval + >>> import requests + >>> from PIL import Image + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"] + + >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") + >>> model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") + + >>> # forward pass + >>> scores = dict() + >>> for text in texts: + ... # prepare inputs + ... encoding = processor(image, text, return_tensors="pt") + ... outputs = model(**encoding) + ... scores[text] = outputs.logits[0, 1].item() + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bridgetower( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + pixel_values=pixel_values, + pixel_mask=pixel_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + image_embeds=image_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooler_output = outputs.pooler_output if return_dict else outputs[2] + + logits = self.itm_score(pooler_output) + + if not return_dict: + return tuple(logits) + + return SequenceClassifierOutput( + loss=None, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py new file mode 100644 index 0000000000..a92932c041 --- /dev/null +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -0,0 +1,118 @@ +# coding=utf-8 +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for BridgeTower. +""" + +from typing import List, Optional, Union + +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType + + +class BridgeTowerProcessor(ProcessorMixin): + r""" + Constructs a BridgeTower processor which wraps a Roberta tokenizer and BridgeTower image processor into a single + processor. + + [`BridgeTowerProcessor`] offers all the functionalities of [`BridgeTowerImageProcessor`] and + [`RobertaTokenizerFast`]. See the docstring of [`~BridgeTowerProcessor.__call__`] and + [`~BridgeTowerProcessor.decode`] for more information. + + Args: + image_processor (`BridgeTowerImageProcessor`): + An instance of [`BridgeTowerImageProcessor`]. The image processor is a required input. + tokenizer (`RobertaTokenizerFast`): + An instance of ['RobertaTokenizerFast`]. The tokenizer is a required input. + """ + attributes = ["image_processor", "tokenizer"] + image_processor_class = "BridgeTowerImageProcessor" + tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") + + def __init__(self, image_processor, tokenizer): + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs + ) -> BatchEncoding: + """ + This method uses [`BridgeTowerImageProcessor.__call__`] method to prepare image(s) for the model, and + [`RobertaTokenizerFast.__call__`] to prepare text for the model. + + Please refer to the docstring of the above two methods for more information. + """ + encoding = self.tokenizer( + text=text, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + # add pixel_values + pixel_mask + encoding_image_processor = self.image_processor( + images, return_tensors=return_tensors, do_normalize=True, do_center_crop=True, **kwargs + ) + encoding.update(encoding_image_processor) + + return encoding + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to RobertaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to RobertaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 32737d93db..c5c14b1a5f 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1257,6 +1257,37 @@ class BloomPreTrainedModel(metaclass=DummyObject): requires_backends(self, ["torch"]) +BRIDGETOWER_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class BridgeTowerForImageAndTextRetrieval(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class BridgeTowerForMaskedLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class BridgeTowerModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class BridgeTowerPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 6cde6701cf..6982f69b14 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -45,6 +45,13 @@ class BlipImageProcessor(metaclass=DummyObject): requires_backends(self, ["vision"]) +class BridgeTowerImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class ChineseCLIPFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/bridgetower/__init__.py b/tests/models/bridgetower/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/bridgetower/test_image_processing_bridgetower.py b/tests/models/bridgetower/test_image_processing_bridgetower.py new file mode 100644 index 0000000000..fe84227bad --- /dev/null +++ b/tests/models/bridgetower/test_image_processing_bridgetower.py @@ -0,0 +1,258 @@ +# coding=utf-8 +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest +from typing import Dict, List, Optional, Union + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import BridgeTowerImageProcessor + + +class BridgeTowerImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + do_resize: bool = True, + size: Dict[str, int] = None, + size_divisor: int = 32, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + do_center_crop: bool = True, + image_mean: Optional[Union[float, List[float]]] = [0.48145466, 0.4578275, 0.40821073], + image_std: Optional[Union[float, List[float]]] = [0.26862954, 0.26130258, 0.27577711], + do_pad: bool = True, + batch_size=7, + min_resolution=30, + max_resolution=400, + num_channels=3, + ): + self.parent = parent + self.do_resize = do_resize + self.size = size if size is not None else {"shortest_edge": 288} + self.size_divisor = size_divisor + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.do_center_crop = do_center_crop + self.image_mean = image_mean + self.image_std = image_std + self.do_pad = do_pad + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + "size_divisor": self.size_divisor, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to BridgeTowerImageProcessor, + assuming do_resize is set to True with a scalar size and size_divisor. + """ + if not batched: + size = self.size["shortest_edge"] + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + else: + h, w = image.shape[1], image.shape[2] + scale = size / min(w, h) + if h < w: + newh, neww = size, scale * w + else: + newh, neww = scale * h, size + + max_size = int((1333 / 800) * size) + if max(newh, neww) > max_size: + scale = max_size / max(newh, neww) + newh = newh * scale + neww = neww * scale + + newh, neww = int(newh + 0.5), int(neww + 0.5) + expected_height, expected_width = ( + newh // self.size_divisor * self.size_divisor, + neww // self.size_divisor * self.size_divisor, + ) + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + +@require_torch +@require_vision +class BridgeTowerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): + image_processing_class = BridgeTowerImageProcessor if is_vision_available() else None + + def setUp(self): + self.image_processor_tester = BridgeTowerImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "size_divisor")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs) + self.assertEqual( + encoded_images.shape, + (1, self.image_processor_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs, batched=True) + self.assertEqual( + encoded_images.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs) + self.assertEqual( + encoded_images.shape, + (1, self.image_processor_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs, batched=True) + self.assertEqual( + encoded_images.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs) + self.assertEqual( + encoded_images.shape, + (1, self.image_processor_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs, batched=True) + self.assertEqual( + encoded_images.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_equivalence_pad_and_create_pixel_mask(self): + # Initialize feature_extractors + image_processing_1 = self.image_processing_class(**self.image_processor_dict) + image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors + encoded_images_with_method = image_processing_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") + encoded_images = image_processing_2(image_inputs, return_tensors="pt") + + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + ) + self.assertTrue( + torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + ) diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py new file mode 100644 index 0000000000..4be58f450e --- /dev/null +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -0,0 +1,409 @@ +# coding=utf-8 +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch BridgeTower model. """ + +import tempfile +import unittest + +import numpy as np + +from transformers import BridgeTowerConfig, is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import BridgeTowerForImageAndTextRetrieval, BridgeTowerForMaskedLM, BridgeTowerModel + from transformers.models.bridgetower.modeling_bridgetower import BRIDGETOWER_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_10 +else: + is_torch_greater_or_equal_than_1_10 = False + +if is_vision_available(): + from PIL import Image + + from transformers import BridgeTowerProcessor + + +class BridgeTowerModelTester: + def __init__( + self, + parent, + share_cross_modal_transformer_layers=True, + drop_rate=0.1, + head_hidden_scale=2, + hidden_act="gelu", + hidden_size=768, + initializer_factor=1, + is_encoder_decoder=False, + layer_norm_eps=1e-05, + share_link_tower_layers=False, + link_tower_type="add", + num_attention_heads=12, + num_hidden_layers=6, + tie_word_embeddings=False, + init_layernorm_from_vision_encoder=False, + output_hidden_states=False, + text_config=None, + vision_config=None, + image_size=288, + ): + self.parent = parent + self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers + self.drop_rate = drop_rate + self.head_hidden_scale = head_hidden_scale + self.hidden_act = hidden_act + self.hidden_size = hidden_size + self.initializer_factor = initializer_factor + self.is_encoder_decoder = is_encoder_decoder + self.layer_norm_eps = layer_norm_eps + self.share_link_tower_layers = share_link_tower_layers + self.link_tower_type = link_tower_type + self.num_attention_heads = num_attention_heads + self.num_hidden_layers = num_hidden_layers + self.tie_word_embeddings = tie_word_embeddings + self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder + self.vocab_size = 50265 + self.num_channels = 3 + self.seq_length = 4 + self.num_image_features = 325 + self.batch_size = 1 + self.image_size = image_size + self.is_training = False + self.expected_num_hidden_layers = 32 + self.output_hidden_states = output_hidden_states + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + pixel_mask = random_attention_mask([self.batch_size, self.image_size, self.image_size]) + config = self.get_config() + return (config, input_ids, attention_mask, pixel_values, pixel_mask) + + def get_config(self): + return BridgeTowerConfig( + share_cross_modal_transformer_layers=self.share_cross_modal_transformer_layers, + drop_rate=self.drop_rate, + head_hidden_scale=self.head_hidden_scale, + hidden_act=self.hidden_act, + hidden_size=self.hidden_size, + initializer_factor=self.initializer_factor, + image_size=self.image_size, + is_encoder_decoder=self.is_encoder_decoder, + layer_norm_eps=self.layer_norm_eps, + share_link_tower_layers=self.share_link_tower_layers, + link_tower_type=self.link_tower_type, + num_attention_heads=self.num_attention_heads, + num_hidden_layers=self.num_hidden_layers, + tie_word_embeddings=self.tie_word_embeddings, + init_layernorm_from_vision_encoder=self.init_layernorm_from_vision_encoder, + num_channels=self.num_channels, + output_hidden_states=self.output_hidden_states, + ) + + def create_and_check_model( + self, + config, + input_ids, + attention_mask, + pixel_values, + pixel_mask, + ): + model = BridgeTowerModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values) + self.parent.assertEqual(result["text_features"].shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual( + result["image_features"].shape, (self.batch_size, self.num_image_features, self.hidden_size) + ) + self.parent.assertEqual(result["pooler_output"].shape, (self.batch_size, 2 * self.hidden_size)) + + def create_and_check_for_image_and_text_retrieval( + self, + config, + input_ids, + attention_mask, + pixel_values, + pixel_mask, + ): + bridgetower_itm_output_last_dimension = 2 + + model = BridgeTowerForImageAndTextRetrieval(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, bridgetower_itm_output_last_dimension)) + + def create_and_check_for_masked_language_modeling( + self, + config, + input_ids, + attention_mask, + pixel_values, + pixel_mask, + ): + model = BridgeTowerForMaskedLM(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, attention_mask, pixel_values, pixel_mask) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "pixel_mask": pixel_mask, + } + return config, inputs_dict + + +@require_torch +@unittest.skipIf(not is_torch_greater_or_equal_than_1_10, "BridgeTower is only available in torch v1.10+") +class BridgeTowerModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + (BridgeTowerModel, BridgeTowerForImageAndTextRetrieval, BridgeTowerForMaskedLM) if is_torch_available() else () + ) + + is_training = False + test_headmasking = False + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + has_attentions = False + + # function to extract meaningful tensor from output per different model_class + def extract_output(self, outputs, model_class): + return outputs["pooler_output"] if model_class == "BridgeTowerModel" else outputs["logits"] + + def setUp(self): + self.model_tester = BridgeTowerModelTester(self) + self.config_tester = ConfigTester(self, config_class=BridgeTowerConfig, hidden_size=37, vocab_size=50265) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_and_text_retrieval(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_and_text_retrieval(*config_and_inputs) + + def test_for_masked_language_modeling(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_language_modeling(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in BRIDGETOWER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = BridgeTowerModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + # Override as extracting meaningful tensor from output is different for BridgeTower + def test_save_load(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**input_dict) + + out_2 = self.extract_output(outputs, model_class.__name__) + out_2 = out_2.cpu().numpy() + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname) + model.to(torch_device) + with torch.no_grad(): + after_outputs = model(**input_dict) + + # Make sure we don't have nans + out_1 = self.extract_output(after_outputs, model_class.__name__) + out_1 = out_1.cpu().numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + # Override this as `hidden states output` is different for BridgeTower + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states_text, hidden_states_vision, hidden_states_cross = ( + outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + ) + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual( + sum((len(hidden_states_text), len(hidden_states_vision), len(hidden_states_cross))), + expected_num_layers, + ) + + seq_length = self.model_tester.seq_length + num_image_features = self.model_tester.num_image_features + + self.assertListEqual( + list(hidden_states_text[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + self.assertListEqual( + list(hidden_states_vision[0].shape), + [num_image_features, 1, self.model_tester.hidden_size], + ) + self.assertListEqual( + list(hidden_states_cross[0][0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + self.assertListEqual( + list(hidden_states_cross[0][1].shape[-2:]), + [num_image_features, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + check_hidden_states_output(inputs_dict, config, model_class) + + # Override as `hidden states output` is different for BridgeTower + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = self.has_attentions + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0][0] + hidden_states.retain_grad() + + if self.has_attentions: + attentions = outputs.attentions[0][0] + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + + if self.has_attentions: + self.assertIsNotNone(attentions.grad) + + @unittest.skip(reason="""Bridge Tower does not have input/output embeddings. So this test is not applicable.""") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="""Bridge Tower does not have input/output embeddings. Thus this test is not applicable.""") + def test_inputs_embeds(self): + pass + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@unittest.skipIf(not is_torch_greater_or_equal_than_1_10, "BridgeTower is only available in torch v1.10+") +class BridgeTowerModelIntegrationTest(unittest.TestCase): + @cached_property + def default_processor(self): + return ( + BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") + if is_vision_available() + else None + ) + + @slow + def test_image_and_text_retrieval(self): + model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm").to( + torch_device + ) + model.eval() + processor = self.default_processor + image = prepare_img() + text = "a bunch of cats laying on a tower." + inputs = processor(image, text, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size([1, 2]) + self.assertEqual(outputs.logits.shape, expected_shape) + self.assertTrue(outputs.logits[0, 1].item() > outputs.logits[0, 0].item()) + + @slow + def test_masked_language_modeling(self): + model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm").to(torch_device) + model.eval() + processor = self.default_processor + image = prepare_img() + text = "a bunch of laying on a tower." + inputs = processor(image, text, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size([1, 11, 50265]) + self.assertEqual(outputs.logits.shape, expected_shape) + + # verify predicted word + predicted_id = outputs.logits.argmax(dim=-1).squeeze(0).tolist()[4] + self.assertTrue(processor.decode([predicted_id]) == " cats") diff --git a/utils/check_repo.py b/utils/check_repo.py index eb12470b26..d8756fe59d 100755 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -45,6 +45,8 @@ PRIVATE_MODELS = [ "TFDPRSpanPredictor", "MaskFormerSwinModel", "MaskFormerSwinPreTrainedModel", + "BridgeTowerTextModel", + "BridgeTowerVisionModel", ] # Update this list for models that are not tested with a comment explaining the reason it should not be. @@ -127,6 +129,8 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [ "TFSegformerDecodeHead", # Not a regular model. "AltRobertaModel", # Building part of bigger (tested) model. "BlipTextLMHeadModel", # No need to test it as it is tested by BlipTextVision models + "BridgeTowerTextModel", # No need to test it as it is tested by BridgeTowerModel model. + "BridgeTowerVisionModel", # No need to test it as it is tested by BridgeTowerModel model. ] # Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't @@ -163,6 +167,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ "BlipTextLMHeadModel", "BlipTextModel", "Swin2SRForImageSuperResolution", + "BridgeTowerForImageAndTextRetrieval", + "BridgeTowerForMaskedLM", "CLIPSegForImageSegmentation", "CLIPSegVisionModel", "CLIPSegTextModel",