From 65b20b739b69cc13a10a6b10351ba31f4ecbef8c Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Wed, 8 Dec 2021 14:20:34 +0100 Subject: [PATCH] Add Perceiver IO (#14487) * First draft * Style and remove mlm * Make forward pass work * More improvements * More improvements * Fix bug * More improvements * More improvements * Add PerceiverTokenizer first draft * Improve conversion script * More improvements * Make conversion script work for the encoder * Make conversion script work with local pickle files * Style & quality, fix-copies * Add dummy input to conversion script * Add absolute position embeddings to TextPreProcessor * Make forward pass of encoder work * More improvements * Move text preprocessor to separate script * More improvements * More improvements * Add post processor * Make MLM model work * Style * Add PerceiverForMaskedLM * Add PerceiverImagePreprocessor * Make style * Make PerceiverForImageClassification work * More improvements * More improvements * Use tokenizer in conversion script * Use PerceiverForMaskedLM in conversion script * Define custom PerceiverModelOutput * Improve PerceiverAttention to make it work for both MLM and image classification * More improvements * More improvements * More improvements to the conversion script * Make conversion script work for both MLM and image classification * Add PerceiverFeatureExtractor * More improvements * Style and quality * Add center cropping * Fix bug * Small fix * Add print statement * Fix bug in image preprocessor * Fix bug with conversion script * Make output position embeddings an nn.Parameter layer instead of nn.Embedding * Comment out print statements * Add position encoding classes * More improvements * Use position_encoding_kwargs * Add PerceiverForImageClassificationFourier * Make style & quality * Add PerceiverForImageClassificationConvProcessing * Style & quality * Add flow model * Move processors to modeling file * Make position encodings modular * Make basic decoder use modular position encodings * Add PerceiverForOpticalFlow to conversion script * Add AudioPreprocessor * Make it possible for the basic decoder to use Fourier position embeddings * Add PerceiverForMultimodalAutoencoding * Improve model for optical flow * Improve _build_network_inputs method * Add print statement * Fix device issue * Fix device of Fourier embeddings * Add print statements for debugging * Add another print statement * Add another print statement * Add another print statement * Add another print statement * Improve PerceiverAudioPreprocessor * Improve conversion script for multimodal modal * More improvements * More improvements * Improve multimodal model * Make forward pass multimodal model work * More improvements * Improve tests * Fix some more tests * Add output dataclasses * Make more tests pass * Add print statements for debuggin * Add tests for image classification * Add PerceiverClassifierOutput * More improvements * Make more tests pass for the optical flow model * Make style & quality * Small improvements * Don't support training for optical flow model for now * Fix _prepare_for_class for tests * Make more tests pass, add some docs * Add multimodal model to tests * Minor fixes * Fix tests * Improve conversion script * Make fixup * Remove pos_dim argument * Fix device issue * Potential fix for OOM * Revert previous commit * Fix test_initialization * Add print statements for debugging * Fix print statement * Add print statement * Add print statement * Add print statement * Add print statement * Add print statement * Add print statement * Remove need for output_shape * Comment out output_shape * Remove unnecessary code * Improve docs * Fix make fixup * Remove PerceiverTextProcessor from init * Improve docs * Small improvement * Apply first batch of suggestions from code review * Apply more suggestions from code review * Update docstrings * Define dicts beforehand for readability * Rename task to architecture in conversion script, include PerceiverModel in tests * Add print statements for debugging * Fix tests on GPU * Remove preprocessors, postprocessors and decoders from main init * Add integration test * Fix docs * Replace einops by torch * Update for new docs frontend * Rename PerceiverForImageClassification * Improve docs * Improve docs * Improve docs of PerceiverModel * Fix some more tests * Improve center_crop * Add PerceiverForSequenceClassification * Small improvements * Fix tests * Add integration test for optical flow model * Clean up * Add tests for tokenizer * Fix tokenizer by adding special tokens properly * Fix CI --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/index.mdx | 2 + docs/source/index.rst | 718 ++++ docs/source/model_doc/perceiver.rst | 234 ++ src/transformers/__init__.py | 32 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/auto/modeling_auto.py | 11 + src/transformers/models/perceiver/__init__.py | 72 + .../perceiver/configuration_perceiver.py | 171 + .../convert_perceiver_haiku_to_pytorch.py | 468 +++ .../perceiver/feature_extraction_perceiver.py | 189 + .../models/perceiver/modeling_perceiver.py | 3118 +++++++++++++++++ .../perceiver/tokenization_perceiver.py | 204 ++ src/transformers/utils/dummy_pt_objects.py | 81 + .../utils/dummy_vision_objects.py | 5 + tests/test_modeling_perceiver.py | 989 ++++++ tests/test_tokenization_perceiver.py | 288 ++ utils/check_repo.py | 2 + 22 files changed, 6592 insertions(+) create mode 100644 docs/source/index.rst create mode 100644 docs/source/model_doc/perceiver.rst create mode 100644 src/transformers/models/perceiver/__init__.py create mode 100644 src/transformers/models/perceiver/configuration_perceiver.py create mode 100644 src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py create mode 100644 src/transformers/models/perceiver/feature_extraction_perceiver.py create mode 100755 src/transformers/models/perceiver/modeling_perceiver.py create mode 100644 src/transformers/models/perceiver/tokenization_perceiver.py create mode 100644 tests/test_modeling_perceiver.py create mode 100644 tests/test_tokenization_perceiver.py diff --git a/README.md b/README.md index 878d854467..8624eabcb0 100644 --- a/README.md +++ b/README.md @@ -286,6 +286,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. diff --git a/README_ko.md b/README_ko.md index a47e5c171d..47c3ea033c 100644 --- a/README_ko.md +++ b/README_ko.md @@ -265,6 +265,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. diff --git a/README_zh-hans.md b/README_zh-hans.md index 57ec6ca17e..07ca87440c 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -289,6 +289,7 @@ conda install -c huggingface transformers 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 +1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index f548b4f4a4..81c3a39bc0 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -301,6 +301,7 @@ conda install -c huggingface transformers 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 4a7c597d08..bdf844b2c4 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -146,6 +146,7 @@ conversion utilities for the following models. 1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. 1. **[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. @@ -234,6 +235,7 @@ Flax), PyTorch, and/or TensorFlow. | OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | | OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | | Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | +| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | | ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | | QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | | RAG | ✅ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000000..003e5b867f --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,718 @@ +Transformers +======================================================================================================================= + +State-of-the-art Natural Language Processing for Jax, Pytorch and TensorFlow + +🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose +architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural +Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between Jax, +PyTorch and TensorFlow. + +This is the documentation of our repository `transformers `__. You can +also follow our `online course `__ that teaches how to use this library, as well as the +other libraries developed by Hugging Face and the Hub. + +If you are looking for custom support from the Hugging Face team +----------------------------------------------------------------------------------------------------------------------- + +.. raw:: html + + + HuggingFace Expert Acceleration Program +
+ +Features +----------------------------------------------------------------------------------------------------------------------- + +- High performance on NLU and NLG tasks +- Low barrier to entry for educators and practitioners + +State-of-the-art NLP for everyone: + +- Deep learning researchers +- Hands-on practitioners +- AI/ML/NLP teachers and educators + +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +Lower compute costs, smaller carbon footprint: + +- Researchers can share trained models instead of always retraining +- Practitioners can reduce compute time and production costs +- 8 architectures with over 30 pretrained models, some in more than 100 languages + +Choose the right framework for every part of a model's lifetime: + +- Train state-of-the-art models in 3 lines of code +- Deep interoperability between Jax, Pytorch and TensorFlow models +- Move a single model between Jax/PyTorch/TensorFlow frameworks at will +- Seamlessly pick the right framework for training, evaluation, production + +The support for Jax is still experimental (with a few models right now), expect to see it grow in the coming months! + +`All the model checkpoints `__ are seamlessly integrated from the huggingface.co `model +hub `__ where they are uploaded directly by `users `__ and +`organizations `__. + +Current number of checkpoints: |checkpoints| + +.. |checkpoints| image:: https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen + +Contents +----------------------------------------------------------------------------------------------------------------------- + +The documentation is organized in five parts: + +- **GET STARTED** contains a quick tour, the installation instructions and some useful information about our philosophy + and a glossary. +- **USING 🤗 TRANSFORMERS** contains general tutorials on how to use the library. +- **ADVANCED GUIDES** contains more advanced guides that are more specific to a given script or part of the library. +- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general research in + transformers model +- The three last section contain the documentation of each public class and function, grouped in: + + - **MAIN CLASSES** for the main classes exposing the important APIs of the library. + - **MODELS** for the classes and functions related to each model implemented in the library. + - **INTERNAL HELPERS** for the classes and functions we use internally. + +The library currently contains Jax, PyTorch and Tensorflow implementations, pretrained model weights, usage scripts and +conversion utilities for the following models. + +Supported models +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. + This list is updated automatically from the README with `make fix-copies`. Do not update manually! + +1. :doc:`ALBERT ` (from Google Research and the Toyota Technological Institute at Chicago) released + with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations + `__, by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush + Sharma, Radu Soricut. +2. :doc:`BART ` (from Facebook) released with the paper `BART: Denoising Sequence-to-Sequence + Pre-training for Natural Language Generation, Translation, and Comprehension + `__ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman + Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. +3. :doc:`BARThez ` (from École polytechnique) released with the paper `BARThez: a Skilled Pretrained + French Sequence-to-Sequence Model `__ by Moussa Kamal Eddine, Antoine J.-P. + Tixier, Michalis Vazirgiannis. +4. :doc:`BARTpho ` (from VinAI Research) released with the paper `BARTpho: Pre-trained + Sequence-to-Sequence Models for Vietnamese `__ by Nguyen Luong Tran, Duong Minh Le + and Dat Quoc Nguyen. +5. :doc:`BEiT ` (from Microsoft) released with the paper `BEiT: BERT Pre-Training of Image Transformers + `__ by Hangbo Bao, Li Dong, Furu Wei. +6. :doc:`BERT ` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional + Transformers for Language Understanding `__ by Jacob Devlin, Ming-Wei Chang, + Kenton Lee and Kristina Toutanova. +7. :doc:`BERTweet ` (from VinAI Research) released with the paper `BERTweet: A pre-trained language + model for English Tweets `__ by Dat Quoc Nguyen, Thanh Vu and Anh Tuan + Nguyen. +8. :doc:`BERT For Sequence Generation ` (from Google) released with the paper `Leveraging + Pre-trained Checkpoints for Sequence Generation Tasks `__ by Sascha Rothe, Shashi + Narayan, Aliaksei Severyn. +9. :doc:`BigBird-RoBERTa ` (from Google Research) released with the paper `Big Bird: Transformers + for Longer Sequences `__ by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua + Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +10. :doc:`BigBird-Pegasus ` (from Google Research) released with the paper `Big Bird: + Transformers for Longer Sequences `__ by Manzil Zaheer, Guru Guruganesh, Avinava + Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr + Ahmed. +11. :doc:`Blenderbot ` (from Facebook) released with the paper `Recipes for building an + open-domain chatbot `__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary + Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +12. :doc:`BlenderbotSmall ` (from Facebook) released with the paper `Recipes for building + an open-domain chatbot `__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, + Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +13. :doc:`BORT ` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT + `__ by Adrian de Wynter and Daniel J. Perry. +14. :doc:`ByT5 ` (from Google Research) released with the paper `ByT5: Towards a token-free future with + pre-trained byte-to-byte models `__ by Linting Xue, Aditya Barua, Noah Constant, + Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. +15. :doc:`CamemBERT ` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty + French Language Model `__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz + Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. +16. :doc:`CANINE ` (from Google Research) released with the paper `CANINE: Pre-training an Efficient + Tokenization-Free Encoder for Language Representation `__ by Jonathan H. Clark, + Dan Garrette, Iulia Turc, John Wieting. +17. :doc:`CLIP ` (from OpenAI) released with the paper `Learning Transferable Visual Models From + Natural Language Supervision `__ by Alec Radford, Jong Wook Kim, Chris Hallacy, + Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen + Krueger, Ilya Sutskever. +18. :doc:`ConvBERT ` (from YituTech) released with the paper `ConvBERT: Improving BERT with + Span-based Dynamic Convolution `__ by Zihang Jiang, Weihao Yu, Daquan Zhou, + Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +19. :doc:`CPM ` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative + Chinese Pre-trained Language Model `__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei + Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, + Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, + Juanzi Li, Xiaoyan Zhu, Maosong Sun. +20. :doc:`CTRL ` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language + Model for Controllable Generation `__ by Nitish Shirish Keskar*, Bryan McCann*, + Lav R. Varshney, Caiming Xiong and Richard Socher. +21. :doc:`DeBERTa ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with + Disentangled Attention `__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu + Chen. +22. :doc:`DeBERTa-v2 ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT + with Disentangled Attention `__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, + Weizhu Chen. +23. :doc:`DeiT ` (from Facebook) released with the paper `Training data-efficient image transformers & + distillation through attention `__ by Hugo Touvron, Matthieu Cord, Matthijs + Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +24. :doc:`DETR ` (from Facebook) released with the paper `End-to-End Object Detection with Transformers + `__ by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, + Alexander Kirillov, Sergey Zagoruyko. +25. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale + Generative Pre-training for Conversational Response Generation `__ by Yizhe + Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +26. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a + distilled version of BERT: smaller, faster, cheaper and lighter `__ by Victor + Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 + `__, RoBERTa into `DistilRoBERTa + `__, Multilingual BERT into + `DistilmBERT `__ and a German + version of DistilBERT. +27. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain + Question Answering `__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick + Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +28. :doc:`EncoderDecoder ` (from Google Research) released with the paper `Leveraging + Pre-trained Checkpoints for Sequence Generation Tasks `__ by Sascha Rothe, Shashi + Narayan, Aliaksei Severyn. +29. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: + Pre-training text encoders as discriminators rather than generators `__ by Kevin + Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +30. :doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model + Pre-training for French `__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, + Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. +31. :doc:`FNet ` (from Google Research) released with the paper `FNet: Mixing Tokens with Fourier + Transforms `__ by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago + Ontanon. +32. :doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer: + Filtering out Sequential Redundancy for Efficient Language Processing `__ by + Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +33. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative + Pre-Training `__ by Alec Radford, Karthik Narasimhan, Tim Salimans + and Ilya Sutskever. +34. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask + Learners `__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David + Luan, Dario Amodei** and Ilya Sutskever**. +35. :doc:`GPT-J ` (from EleutherAI) released in the repository `kingoflolz/mesh-transformer-jax + `__ by Ben Wang and Aran Komatsuzaki. +36. :doc:`GPT Neo ` (from EleutherAI) released in the repository `EleutherAI/gpt-neo + `__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. +37. :doc:`Hubert ` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech + Representation Learning by Masked Prediction of Hidden Units `__ by Wei-Ning Hsu, + Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. +38. :doc:`I-BERT ` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization + `__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +39. `ImageGPT `__ (from OpenAI) released with the + paper `Generative Pretraining from Pixels `__ by Mark Chen, Alec Radford, Rewon + Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +40. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training + of Text and Layout for Document Image Understanding `__ by Yiheng Xu, Minghao Li, + Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. +41. :doc:`LayoutLMv2 ` (from Microsoft Research Asia) released with the paper `LayoutLMv2: + Multi-modal Pre-training for Visually-Rich Document Understanding `__ by Yang Xu, + Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min + Zhang, Lidong Zhou. +42. :doc:`LayoutXLM ` (from Microsoft Research Asia) released with the paper `LayoutXLM: + Multimodal Pre-training for Multilingual Visually-rich Document Understanding `__ + by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. +43. :doc:`LED ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer + `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. +44. :doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document + Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. +45. :doc:`LUKE ` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity + Representations with Entity-aware Self-attention `__ by Ikuya Yamada, Akari Asai, + Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. +46. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality + Encoder Representations from Transformers for Open-Domain Question Answering `__ + by Hao Tan and Mohit Bansal. +47. :doc:`M2M100 ` (from Facebook) released with the paper `Beyond English-Centric Multilingual + Machine Translation `__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, + Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, + Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. +48. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by + Jörg Tiedemann. The `Marian Framework `__ is being developed by the Microsoft + Translator Team. +49. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for + Neural Machine Translation `__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, + Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. +50. :doc:`MBart-50 ` (from Facebook) released with the paper `Multilingual Translation with Extensible + Multilingual Pretraining and Finetuning `__ by Yuqing Tang, Chau Tran, Xian Li, + Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +51. :doc:`Megatron-BERT ` (from NVIDIA) released with the paper `Megatron-LM: Training + Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad + Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +52. :doc:`Megatron-GPT2 ` (from NVIDIA) released with the paper `Megatron-LM: Training + Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad + Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +53. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted + Pre-training for Language Understanding `__ by Kaitao Song, Xu Tan, Tao Qin, + Jianfeng Lu, Tie-Yan Liu. +54. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained + text-to-text transformer `__ by Linting Xue, Noah Constant, Adam Roberts, Mihir + Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +55. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted + Gap-sentences for Abstractive Summarization `__ by Jingqing Zhang, Yao Zhao, + Mohammad Saleh and Peter J. Liu. +56. `Perceiver IO `__ (from Deepmind) released + with the paper `Perceiver IO: A General Architecture for Structured Inputs & Outputs + `__ by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, + Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. + Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +57. :doc:`PhoBERT ` (from VinAI Research) released with the paper `PhoBERT: Pre-trained language + models for Vietnamese `__ by Dat Quoc Nguyen and Anh Tuan + Nguyen. +58. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting + Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, + Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +59. :doc:`QDQBert ` (from NVIDIA) released with the paper `Integer Quantization for Deep Learning + Inference: Principles and Empirical Evaluation `__ by Hao Wu, Patrick Judd, + Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. +60. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient + Transformer `__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +61. :doc:`RemBERT ` (from Google Research) released with the paper `Rethinking embedding coupling in + pre-trained language models `__ by Hyung Won Chung, Thibault Févry, Henry + Tsai, M. Johnson, Sebastian Ruder. +62. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT + Pretraining Approach `__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar + Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +63. :doc:`RoFormer ` (from ZhuiyiTechnology), released together with the paper a `RoFormer: + Enhanced Transformer with Rotary Position Embedding `__ by Jianlin Su and + Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +64. :doc:`SegFormer ` (from NVIDIA) released with the paper `SegFormer: Simple and Efficient + Design for Semantic Segmentation with Transformers `__ by Enze Xie, Wenhai Wang, + Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +65. :doc:`SEW ` (from ASAPP) released with the paper `Performance-Efficiency Trade-offs in Unsupervised + Pre-training for Speech Recognition `__ by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu + Han, Kilian Q. Weinberger, Yoav Artzi. +66. :doc:`SEW-D ` (from ASAPP) released with the paper `Performance-Efficiency Trade-offs in + Unsupervised Pre-training for Speech Recognition `__ by Felix Wu, Kwangyoun Kim, + Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +67. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper + `fairseq S2T: Fast Speech-to-Text Modeling with fairseq `__ by Changhan Wang, Yun + Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. +68. :doc:`SpeechToTextTransformer2 ` (from Facebook), released together with the paper + `Large-Scale Self- and Semi-Supervised Learning for Speech Translation `__ by + Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. +69. :doc:`Splinter ` (from Tel Aviv University), released together with the paper `Few-Shot + Question Answering by Pretraining Span Selection `__ by Ori Ram, Yuval Kirstain, + Jonathan Berant, Amir Globerson, Omer Levy. +70. :doc:`SqueezeBert ` (from Berkeley) released with the paper `SqueezeBERT: What can computer + vision teach NLP about efficient neural networks? `__ by Forrest N. Iandola, + Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +71. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a + Unified Text-to-Text Transformer `__ by Colin Raffel and Noam Shazeer and Adam + Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +72. :doc:`T5v1.1 ` (from Google AI) released in the repository + `google-research/text-to-text-transfer-transformer + `__ by + Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi + Zhou and Wei Li and Peter J. Liu. +73. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via + Pre-training `__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, + Francesco Piccinno and Julian Martin Eisenschlos. +74. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: + Attentive Language Models Beyond a Fixed-Length Context `__ by Zihang Dai*, + Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +75. :doc:`TrOCR ` (from Microsoft), released together with the paper `TrOCR: Transformer-based Optical + Character Recognition with Pre-trained Models `__ by Minghao Li, Tengchao Lv, Lei + Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +76. :doc:`UniSpeech ` (from Microsoft Research) released with the paper `UniSpeech: Unified Speech + Representation Learning with Labeled and Unlabeled Data `__ by Chengyi Wang, Yu + Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +77. :doc:`UniSpeechSat ` (from Microsoft Research) released with the paper `UNISPEECH-SAT: + UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING `__ by + Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, + Xiangzhan Yu. +78. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 + Words: Transformers for Image Recognition at Scale `__ by Alexey Dosovitskiy, + Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias + Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +79. :doc:`VisualBERT ` (from UCLA NLP) released with the paper `VisualBERT: A Simple and + Performant Baseline for Vision and Language `__ by Liunian Harold Li, Mark + Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +80. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for + Self-Supervised Learning of Speech Representations `__ by Alexei Baevski, Henry + Zhou, Abdelrahman Mohamed, Michael Auli. +81. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model + Pretraining `__ by Guillaume Lample and Alexis Conneau. +82. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: + Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, + Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +83. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised + Cross-lingual Representation Learning at Scale `__ by Alexis Conneau*, Kartikay + Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke + Zettlemoyer and Veselin Stoyanov. +84. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive + Pretraining for Language Understanding `__ by Zhilin Yang*, Zihang Dai*, Yiming + Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. +85. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised + Cross-Lingual Representation Learning For Speech Recognition `__ by Alexis + Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. + + +Supported frameworks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The table below represents the current support in the library for each of those models, whether they have a Python +tokenizer (called "slow"). A "fast" tokenizer backed by the 🤗 Tokenizers library, whether they have support in Jax (via +Flax), PyTorch, and/or TensorFlow. + +.. + This table is updated automatically from the auto modules with `make fix-copies`. Do not update manually! + +.. rst-class:: center-aligned-table + ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | ++=============================+================+================+=================+====================+==============+ +| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Canine | ✅ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| CLIP | ✅ | ✅ | ✅ | ❌ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| GPT-J | ❌ | ❌ | ✅ | ❌ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| LED | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| RoFormer | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Speech2Text | ✅ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Vision Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ + +.. toctree:: + :maxdepth: 2 + :caption: Get started + + quicktour + installation + philosophy + glossary + +.. toctree:: + :maxdepth: 2 + :caption: Using 🤗 Transformers + + task_summary + model_summary + preprocessing + training + model_sharing + tokenizer_summary + multilingual + +.. toctree:: + :maxdepth: 2 + :caption: Advanced guides + + pretrained_models + examples + troubleshooting + custom_datasets + notebooks + sagemaker + community + converting_tensorflow_models + migration + contributing + add_new_model + add_new_pipeline + fast_tokenizers + performance + parallelism + testing + debugging + serialization + pr_checks + +.. toctree:: + :maxdepth: 2 + :caption: Research + + bertology + perplexity + benchmarks + +.. toctree:: + :maxdepth: 2 + :caption: Main Classes + + main_classes/callback + main_classes/configuration + main_classes/data_collator + main_classes/keras_callbacks + main_classes/logging + main_classes/model + main_classes/optimizer_schedules + main_classes/output + main_classes/pipelines + main_classes/processors + main_classes/tokenizer + main_classes/trainer + main_classes/deepspeed + main_classes/feature_extractor + +.. toctree:: + :maxdepth: 2 + :caption: Models + + model_doc/albert + model_doc/auto + model_doc/bart + model_doc/barthez + model_doc/bartpho + model_doc/beit + model_doc/bert + model_doc/bertweet + model_doc/bertgeneration + model_doc/bert_japanese + model_doc/bigbird + model_doc/bigbird_pegasus + model_doc/blenderbot + model_doc/blenderbot_small + model_doc/bort + model_doc/byt5 + model_doc/camembert + model_doc/canine + model_doc/clip + model_doc/convbert + model_doc/cpm + model_doc/ctrl + model_doc/deberta + model_doc/deberta_v2 + model_doc/deit + model_doc/detr + model_doc/dialogpt + model_doc/distilbert + model_doc/dpr + model_doc/electra + model_doc/encoderdecoder + model_doc/flaubert + model_doc/fnet + model_doc/fsmt + model_doc/funnel + model_doc/herbert + model_doc/ibert + model_doc/imagegpt + model_doc/layoutlm + model_doc/layoutlmv2 + model_doc/layoutxlm + model_doc/led + model_doc/longformer + model_doc/luke + model_doc/lxmert + model_doc/marian + model_doc/m2m_100 + model_doc/mbart + model_doc/megatron_bert + model_doc/megatron_gpt2 + model_doc/mobilebert + model_doc/mpnet + model_doc/mt5 + model_doc/gpt + model_doc/gpt2 + model_doc/gptj + model_doc/gpt_neo + model_doc/hubert + model_doc/pegasus + model_doc/perceiver + model_doc/phobert + model_doc/prophetnet + model_doc/qdqbert + model_doc/rag + model_doc/reformer + model_doc/rembert + model_doc/retribert + model_doc/roberta + model_doc/roformer + model_doc/segformer + model_doc/sew + model_doc/sew_d + model_doc/speechencoderdecoder + model_doc/speech_to_text + model_doc/speech_to_text_2 + model_doc/splinter + model_doc/squeezebert + model_doc/t5 + model_doc/t5v1.1 + model_doc/tapas + model_doc/transformerxl + model_doc/trocr + model_doc/unispeech + model_doc/unispeech_sat + model_doc/visionencoderdecoder + model_doc/vision_text_dual_encoder + model_doc/vit + model_doc/visual_bert + model_doc/wav2vec2 + model_doc/xlm + model_doc/xlmprophetnet + model_doc/xlmroberta + model_doc/xlnet + model_doc/xlsr_wav2vec2 + +.. toctree:: + :maxdepth: 2 + :caption: Internal Helpers + + internal/modeling_utils + internal/pipelines_utils + internal/tokenization_utils + internal/trainer_utils + internal/generation_utils + internal/file_utils diff --git a/docs/source/model_doc/perceiver.rst b/docs/source/model_doc/perceiver.rst new file mode 100644 index 0000000000..50daae6cdf --- /dev/null +++ b/docs/source/model_doc/perceiver.rst @@ -0,0 +1,234 @@ +.. + Copyright 2021 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +Perceiver +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The Perceiver IO model was proposed in `Perceiver IO: A General Architecture for Structured Inputs & Outputs +`__ by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, +Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. +Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. + +Perceiver IO is a generalization of `Perceiver `__ to handle arbitrary outputs in +addition to arbitrary inputs. The original Perceiver only produced a single classification label. In addition to +classification labels, Perceiver IO can produce (for example) language, optical flow, and multimodal videos with audio. +This is done using the same building blocks as the original Perceiver. The computational complexity of Perceiver IO is +linear in the input and output size and the bulk of the processing occurs in the latent space, allowing us to process +inputs and outputs that are much larger than can be handled by standard Transformers. This means, for example, +Perceiver IO can do BERT-style masked language modeling directly using bytes instead of tokenized inputs. + +The abstract from the paper is the following: + +*The recently-proposed Perceiver model obtains good results on several domains (images, audio, multimodal, point +clouds) while scaling linearly in compute and memory with the input size. While the Perceiver supports many kinds of +inputs, it can only produce very simple outputs such as class scores. Perceiver IO overcomes this limitation without +sacrificing the original's appealing properties by learning to flexibly query the model's latent space to produce +outputs of arbitrary size and semantics. Perceiver IO still decouples model depth from data size and still scales +linearly with data size, but now with respect to both input and output sizes. The full Perceiver IO model achieves +strong results on tasks with highly structured output spaces, such as natural language and visual understanding, +StarCraft II, and multi-task and multi-modal domains. As highlights, Perceiver IO matches a Transformer-based BERT +baseline on the GLUE language benchmark without the need for input tokenization and achieves state-of-the-art +performance on Sintel optical flow estimation.* + +Here's a TLDR explaining how Perceiver works: + +The main problem with the self-attention mechanism of the Transformer is that the time and memory requirements scale +quadratically with the sequence length. Hence, models like BERT and RoBERTa are limited to a max sequence length of 512 +tokens. Perceiver aims to solve this issue by, instead of performing self-attention on the inputs, perform it on a set +of latent variables, and only use the inputs for cross-attention. In this way, the time and memory requirements don't +depend on the length of the inputs anymore, as one uses a fixed amount of latent variables, like 256 or 512. These are +randomly initialized, after which they are trained end-to-end using backpropagation. + +Internally, :class:`~transformers.PerceiverModel` will create the latents, which is a tensor of shape +:obj:`(batch_size, num_latents, d_latents)`. One must provide :obj:`inputs` (which could be text, images, audio, you +name it!) to the model, which it will use to perform cross-attention with the latents. The output of the Perceiver +encoder is a tensor of the same shape. One can then, similar to BERT, convert the last hidden states of the latents to +classification logits by averaging along the sequence dimension, and placing a linear layer on top of that to project +the :obj:`d_latents` to :obj:`num_labels`. + +This was the idea of the original Perceiver paper. However, it could only output classification logits. In a follow-up +work, PerceiverIO, they generalized it to let the model also produce outputs of arbitrary size. How, you might ask? The +idea is actually relatively simple: one defines outputs of an arbitrary size, and then applies cross-attention with the +last hidden states of the latents, using the outputs as queries, and the latents as keys and values. + +So let's say one wants to perform masked language modeling (BERT-style) with the Perceiver. As the Perceiver's input +length will not have an impact on the computation time of the self-attention layers, one can provide raw bytes, +providing :obj:`inputs` of length 2048 to the model. If one now masks out certain of these 2048 tokens, one can define +the :obj:`outputs` as being of shape: :obj:`(batch_size, 2048, 768)`. Next, one performs cross-attention with the final +hidden states of the latents to update the :obj:`outputs` tensor. After cross-attention, one still has a tensor of +shape :obj:`(batch_size, 2048, 768)`. One can then place a regular language modeling head on top, to project the last +dimension to the vocabulary size of the model, i.e. creating logits of shape :obj:`(batch_size, 2048, 262)` (as +Perceiver uses a vocabulary size of 262 byte IDs). + + +This model was contributed by ` `__. The original code can be found `here +`__. + + +Perceiver specific outputs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverModelOutput + :members: + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverDecoderOutput + :members: + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverMaskedLMOutput + :members: + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverClassifierOutput + :members: + + +PerceiverConfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PerceiverConfig + :members: + + +PerceiverTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PerceiverTokenizer + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary + + +PerceiverFeatureExtractor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PerceiverFeatureExtractor + :members: + + +PerceiverTextPreprocessor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverTextPreprocessor + :members: + + +PerceiverImagePreprocessor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor + :members: + + +PerceiverOneHotPreprocessor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverOneHotPreprocessor + :members: + + +PerceiverAudioPreprocessor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverAudioPreprocessor + :members: + + +PerceiverMultimodalPreprocessor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor + :members: + + +PerceiverProjectionPostprocessor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverProjectionPostprocessor + :members: + + +PerceiverAudioPostprocessor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverAudioPostprocessor + :members: + + +PerceiverClassificationPostprocessor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverClassificationPostprocessor + :members: + + +PerceiverMultimodalPostprocessor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalPostprocessor + :members: + + +PerceiverModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PerceiverModel + :members: forward + + +PerceiverForMaskedLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PerceiverForMaskedLM + :members: forward + + +PerceiverForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PerceiverForSequenceClassification + :members: forward + + +PerceiverForImageClassificationLearned +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PerceiverForImageClassificationLearned + :members: forward + + +PerceiverForImageClassificationFourier +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PerceiverForImageClassificationFourier + :members: forward + + +PerceiverForImageClassificationConvProcessing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PerceiverForImageClassificationConvProcessing + :members: forward + + +PerceiverForOpticalFlow +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PerceiverForOpticalFlow + :members: forward + + +PerceiverForMultimodalAutoencoding +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PerceiverForMultimodalAutoencoding + :members: forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index cface3ab3c..efa01a89f6 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -253,6 +253,7 @@ _import_structure = { "models.mt5": ["MT5Config"], "models.openai": ["OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "OpenAIGPTConfig", "OpenAIGPTTokenizer"], "models.pegasus": ["PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP", "PegasusConfig", "PegasusTokenizer"], + "models.perceiver": ["PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP", "PerceiverConfig", "PerceiverTokenizer"], "models.phobert": ["PhobertTokenizer"], "models.prophetnet": ["PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ProphetNetConfig", "ProphetNetTokenizer"], "models.qdqbert": ["QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "QDQBertConfig"], @@ -502,6 +503,7 @@ if is_vision_available(): _import_structure["models.layoutlmv2"].append("LayoutLMv2FeatureExtractor") _import_structure["models.layoutlmv2"].append("LayoutLMv2Processor") _import_structure["models.layoutxlm"].append("LayoutXLMProcessor") + _import_structure["models.perceiver"].append("PerceiverFeatureExtractor") _import_structure["models.segformer"].append("SegformerFeatureExtractor") _import_structure["models.vit"].append("ViTFeatureExtractor") else: @@ -1144,6 +1146,21 @@ if is_torch_available(): _import_structure["models.pegasus"].extend( ["PegasusForCausalLM", "PegasusForConditionalGeneration", "PegasusModel", "PegasusPreTrainedModel"] ) + _import_structure["models.perceiver"].extend( + [ + "PERCEIVER_PRETRAINED_MODEL_ARCHIVE_LIST", + "PerceiverForImageClassificationConvProcessing", + "PerceiverForImageClassificationFourier", + "PerceiverForImageClassificationLearned", + "PerceiverForMaskedLM", + "PerceiverForMultimodalAutoencoding", + "PerceiverForOpticalFlow", + "PerceiverForSequenceClassification", + "PerceiverLayer", + "PerceiverModel", + "PerceiverPreTrainedModel", + ] + ) _import_structure["models.prophetnet"].extend( [ "PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2263,6 +2280,7 @@ if TYPE_CHECKING: from .models.mt5 import MT5Config from .models.openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig, OpenAIGPTTokenizer from .models.pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig, PegasusTokenizer + from .models.perceiver import PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP, PerceiverConfig, PerceiverTokenizer from .models.phobert import PhobertTokenizer from .models.prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig, ProphetNetTokenizer from .models.qdqbert import QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, QDQBertConfig @@ -2470,6 +2488,7 @@ if TYPE_CHECKING: from .models.imagegpt import ImageGPTFeatureExtractor from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2Processor from .models.layoutxlm import LayoutXLMProcessor + from .models.perceiver import PerceiverFeatureExtractor from .models.segformer import SegformerFeatureExtractor from .models.vit import ViTFeatureExtractor else: @@ -3006,6 +3025,19 @@ if TYPE_CHECKING: PegasusModel, PegasusPreTrainedModel, ) + from .models.perceiver import ( + PERCEIVER_PRETRAINED_MODEL_ARCHIVE_LIST, + PerceiverForImageClassificationConvProcessing, + PerceiverForImageClassificationFourier, + PerceiverForImageClassificationLearned, + PerceiverForMaskedLM, + PerceiverForMultimodalAutoencoding, + PerceiverForOpticalFlow, + PerceiverForSequenceClassification, + PerceiverLayer, + PerceiverModel, + PerceiverPreTrainedModel, + ) from .models.prophetnet import ( PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, ProphetNetDecoder, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 61a6bebed1..5095d19d91 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -78,6 +78,7 @@ from . import ( mt5, openai, pegasus, + perceiver, phobert, prophetnet, qdqbert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 8d1293042e..ad2712f467 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -37,6 +37,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("fnet", "FNetConfig"), ("segformer", "SegformerConfig"), ("vision-text-dual-encoder", "VisionTextDualEncoderConfig"), + ("perceiver", "PerceiverConfig"), ("gptj", "GPTJConfig"), ("layoutlmv2", "LayoutLMv2Config"), ("beit", "BeitConfig"), @@ -119,6 +120,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( ("fnet", "FNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("pegasus", "PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("segformer", "SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("perceiver", "PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("gptj", "GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("layoutlmv2", "LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -194,6 +196,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("fnet", "FNet"), ("segformer", "SegFormer"), ("vision-text-dual-encoder", "VisionTextDualEncoder"), + ("perceiver", "Perceiver"), ("gptj", "GPT-J"), ("beit", "BEiT"), ("rembert", "RemBERT"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 97b1093c14..7fca274e3e 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -33,6 +33,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("fnet", "FNetModel"), ("segformer", "SegformerModel"), ("vision-text-dual-encoder", "VisionTextDualEncoderModel"), + ("perceiver", "PerceiverModel"), ("gptj", "GPTJModel"), ("layoutlmv2", "LayoutLMv2Model"), ("beit", "BeitModel"), @@ -247,6 +248,14 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( ("beit", "BeitForImageClassification"), ("segformer", "SegformerForImageClassification"), ("imagegpt", "ImageGPTForImageClassification"), + ( + "perceiver", + ( + "PerceiverForImageClassificationLearned", + "PerceiverForImageClassificationFourier", + "PerceiverForImageClassificationConvProcessing", + ), + ), ] ) @@ -266,6 +275,7 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict( MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( [ # Model for Masked LM mapping + ("perceiver", "PerceiverForMaskedLM"), ("qdqbert", "QDQBertForMaskedLM"), ("fnet", "FNetForMaskedLM"), ("rembert", "RemBertForMaskedLM"), @@ -337,6 +347,7 @@ MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict( MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Sequence Classification mapping + ("perceiver", "PerceiverForSequenceClassification"), ("qdqbert", "QDQBertForSequenceClassification"), ("fnet", "FNetForSequenceClassification"), ("gptj", "GPTJForSequenceClassification"), diff --git a/src/transformers/models/perceiver/__init__.py b/src/transformers/models/perceiver/__init__.py new file mode 100644 index 0000000000..f066c6cacf --- /dev/null +++ b/src/transformers/models/perceiver/__init__.py @@ -0,0 +1,72 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available, is_vision_available + + +_import_structure = { + "configuration_perceiver": ["PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP", "PerceiverConfig"], + "tokenization_perceiver": ["PerceiverTokenizer"], +} + +if is_vision_available(): + _import_structure["feature_extraction_perceiver"] = ["PerceiverFeatureExtractor"] + +if is_torch_available(): + _import_structure["modeling_perceiver"] = [ + "PERCEIVER_PRETRAINED_MODEL_ARCHIVE_LIST", + "PerceiverForImageClassificationConvProcessing", + "PerceiverForImageClassificationFourier", + "PerceiverForImageClassificationLearned", + "PerceiverForMaskedLM", + "PerceiverForMultimodalAutoencoding", + "PerceiverForOpticalFlow", + "PerceiverForSequenceClassification", + "PerceiverLayer", + "PerceiverModel", + "PerceiverPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_perceiver import PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP, PerceiverConfig + from .tokenization_perceiver import PerceiverTokenizer + + if is_vision_available(): + from .feature_extraction_perceiver import PerceiverFeatureExtractor + + if is_torch_available(): + from .modeling_perceiver import ( + PERCEIVER_PRETRAINED_MODEL_ARCHIVE_LIST, + PerceiverForImageClassificationConvProcessing, + PerceiverForImageClassificationFourier, + PerceiverForImageClassificationLearned, + PerceiverForMaskedLM, + PerceiverForMultimodalAutoencoding, + PerceiverForOpticalFlow, + PerceiverForSequenceClassification, + PerceiverLayer, + PerceiverModel, + PerceiverPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/perceiver/configuration_perceiver.py b/src/transformers/models/perceiver/configuration_perceiver.py new file mode 100644 index 0000000000..e370ee3b51 --- /dev/null +++ b/src/transformers/models/perceiver/configuration_perceiver.py @@ -0,0 +1,171 @@ +# coding=utf-8 +# Copyright Deepmind and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Perceiver model configuration """ + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "deepmind/language-perceiver": "https://huggingface.co/deepmind/language-perceiver/resolve/main/config.json", + # See all Perceiver models at https://huggingface.co/models?filter=perceiver +} + + +class PerceiverConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.PerceiverModel`. It is used + to instantiate an Perceiver model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Perceiver + `deepmind/language-perceiver `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + Args: + num_latents (:obj:`int`, `optional`, defaults to 256): + The number of latents. + d_latents (:obj:`int`, `optional`, defaults to 1280): + Dimension of the latent embeddings. + d_model (:obj:`int`, `optional`, defaults to 768): + Dimension of the inputs. + num_blocks (:obj:`int`, `optional`, defaults to 1): + Number of blocks in the Transformer encoder. + num_self_attends_per_block (:obj:`int`, `optional`, defaults to 26): + The number of self-attention layers per block. + num_self_attention_heads (:obj:`int`, `optional`, defaults to 8): + Number of attention heads for each self-attention layer in the Transformer encoder. + num_cross_attention_heads (:obj:`int`, `optional`, defaults to 8): + Number of attention heads for each cross-attention layer in the Transformer encoder. + qk_channels (:obj:`int`, `optional`): + Dimension to project the queries + keys before applying attention in the cross-attention and self-attention + layers of the encoder. Will default to preserving the dimension of the queries if not specified. + v_channels (:obj:`int`, `optional`): + Dimension to project the values before applying attention in the cross-attention and self-attention layers + of the encoder. Will default to preserving the dimension of the queries if not specified. + cross_attention_shape_for_attention (:obj:`str`, `optional`, defaults to :obj:`'kv'`): + Dimension to use when downsampling the queries and keys in the cross-attention layer of the encoder. + self_attention_widening_factor (:obj:`int`, `optional`, defaults to 1): + Dimension of the feed-forward layer in the cross-attention layer of the Transformer encoder. + cross_attention_widening_factor (:obj:`int`, `optional`, defaults to 1): + Dimension of the feed-forward layer in the self-attention layers of the Transformer encoder. + hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers. + use_query_residual (:obj:`float`, `optional`, defaults to :obj:`True`): + Whether to add a query residual in the cross-attention layer of the encoder. + vocab_size (:obj:`int`, `optional`, defaults to 262): + Vocabulary size for the masked language modeling model. + max_position_embeddings (:obj:`int`, `optional`, defaults to 2048): + The maximum sequence length that the masked language modeling model might ever be used with. Typically set + this to something large just in case (e.g., 512 or 1024 or 2048). + image_size (:obj:`int`, `optional`, defaults to 56): + Size of the images after preprocessing, for :class:`~transformers.PerceiverForImageClassificationLearned`. + train_size (:obj:`List[int]`, `optional`, defaults to [368, 496]): + Training size of the images for the optical flow model. + num_frames (:obj:`int`, `optional`, defaults to 16): + Number of video frames used for the multimodal autoencoding model. + audio_samples_per_frame (:obj:`int`, `optional`, defaults to 1920): + Number of audio samples per frame for the multimodal autoencoding model. + samples_per_patch (:obj:`int`, `optional`, defaults to 16): + Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model. + output_shape (:obj:`List[int]`, `optional`, defaults to :obj:`[1, 16, 224, 224]`): + Shape of the output for the multimodal autoencoding model. + + Example:: + + >>> from transformers import PerceiverModel, PerceiverConfig + + >>> # Initializing a Perceiver deepmind/language-perceiver style configuration + >>> configuration = PerceiverConfig() + + >>> # Initializing a model from the deepmind/language-perceiver style configuration + >>> model = PerceiverModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "perceiver" + + def __init__( + self, + num_latents=256, + d_latents=1280, + d_model=768, + num_blocks=1, + num_self_attends_per_block=26, + num_self_attention_heads=8, + num_cross_attention_heads=8, + qk_channels=None, + v_channels=None, + cross_attention_shape_for_attention="kv", + self_attention_widening_factor=1, + cross_attention_widening_factor=1, + hidden_act="gelu", + attention_probs_dropout_prob=0.1, + position_embedding_init_scale=0.02, + initializer_range=0.02, + layer_norm_eps=1e-12, + is_encoder_decoder=False, + use_query_residual=True, + vocab_size=262, + max_position_embeddings=2048, + image_size=56, + train_size=[368, 496], + num_frames=16, + audio_samples_per_frame=1920, + samples_per_patch=16, + output_shape=[1, 16, 224, 224], + **kwargs + ): + super().__init__(**kwargs) + + self.num_latents = num_latents + self.d_latents = d_latents + self.d_model = d_model + self.num_blocks = num_blocks + self.num_self_attends_per_block = num_self_attends_per_block + self.num_self_attention_heads = num_self_attention_heads + self.num_cross_attention_heads = num_cross_attention_heads + self.qk_channels = qk_channels + self.v_channels = v_channels + self.cross_attention_shape_for_attention = cross_attention_shape_for_attention + self.self_attention_widening_factor = self_attention_widening_factor + self.cross_attention_widening_factor = cross_attention_widening_factor + self.hidden_act = hidden_act + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_query_residual = use_query_residual + # masked language modeling attributes + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + # image classification attributes + self.image_size = image_size + # flow attributes + self.train_size = train_size + # multimodal autoencoding attributes + self.num_frames = num_frames + self.audio_samples_per_frame = audio_samples_per_frame + self.samples_per_patch = samples_per_patch + self.output_shape = output_shape diff --git a/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py b/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py new file mode 100644 index 0000000000..3969dc3c7a --- /dev/null +++ b/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py @@ -0,0 +1,468 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Perceiver checkpoints originally implemented in Haiku.""" + + +import argparse +import json +import pickle +from pathlib import Path + +import numpy as np +import torch +from PIL import Image + +import haiku as hk +import requests +from huggingface_hub import cached_download, hf_hub_url +from transformers import ( + PerceiverConfig, + PerceiverFeatureExtractor, + PerceiverForImageClassificationConvProcessing, + PerceiverForImageClassificationFourier, + PerceiverForImageClassificationLearned, + PerceiverForMaskedLM, + PerceiverForMultimodalAutoencoding, + PerceiverForOpticalFlow, + PerceiverTokenizer, +) +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def prepare_img(): + # We will verify our results on an image of a dog + url = "https://storage.googleapis.com/perceiver_io/dalmation.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +def rename_keys(state_dict, architecture): + for name in list(state_dict): + param = state_dict.pop(name) + + # PREPROCESSORS + # rename text preprocessor embeddings (for MLM model) + name = name.replace("embed/embeddings", "input_preprocessor.embeddings.weight") + if name.startswith("trainable_position_encoding/pos_embs"): + name = name.replace( + "trainable_position_encoding/pos_embs", "input_preprocessor.position_embeddings.weight" + ) + + # rename image preprocessor embeddings (for image classification model with learned position embeddings) + name = name.replace("image_preprocessor/~/conv2_d/w", "input_preprocessor.convnet_1x1.weight") + name = name.replace("image_preprocessor/~/conv2_d/b", "input_preprocessor.convnet_1x1.bias") + name = name.replace( + "image_preprocessor/~_build_network_inputs/trainable_position_encoding/pos_embs", + "input_preprocessor.position_embeddings.position_embeddings", + ) + name = name.replace( + "image_preprocessor/~_build_network_inputs/position_encoding_projector/linear/w", + "input_preprocessor.positions_projection.weight", + ) + name = name.replace( + "image_preprocessor/~_build_network_inputs/position_encoding_projector/linear/b", + "input_preprocessor.positions_projection.bias", + ) + + # rename image preprocessor embeddings (for image classification model with conv processing) + if "counter" in name or "hidden" in name: + continue + name = name.replace( + "image_preprocessor/~/conv2_d_downsample/~/conv/w", "input_preprocessor.convnet.conv.weight" + ) + name = name.replace( + "image_preprocessor/~/conv2_d_downsample/~/batchnorm/offset", "input_preprocessor.convnet.batchnorm.bias" + ) + name = name.replace( + "image_preprocessor/~/conv2_d_downsample/~/batchnorm/scale", "input_preprocessor.convnet.batchnorm.weight" + ) + name = name.replace( + "image_preprocessor/~/conv2_d_downsample/~/batchnorm/~/mean_ema/average", + "input_preprocessor.convnet.batchnorm.running_mean", + ) + name = name.replace( + "image_preprocessor/~/conv2_d_downsample/~/batchnorm/~/var_ema/average", + "input_preprocessor.convnet.batchnorm.running_var", + ) + + # rename image preprocessor embeddings (for optical flow model) + name = name.replace("image_preprocessor/patches_linear/b", "input_preprocessor.conv_after_patches.bias") + name = name.replace("image_preprocessor/patches_linear/w", "input_preprocessor.conv_after_patches.weight") + + # rename multimodal preprocessor embeddings + name = name.replace("multimodal_preprocessor/audio_mask_token/pos_embs", "input_preprocessor.mask.audio") + name = name.replace("multimodal_preprocessor/audio_padding/pos_embs", "input_preprocessor.padding.audio") + name = name.replace("multimodal_preprocessor/image_mask_token/pos_embs", "input_preprocessor.mask.image") + name = name.replace("multimodal_preprocessor/image_padding/pos_embs", "input_preprocessor.padding.image") + name = name.replace("multimodal_preprocessor/label_mask_token/pos_embs", "input_preprocessor.mask.label") + name = name.replace("multimodal_preprocessor/label_padding/pos_embs", "input_preprocessor.padding.label") + + # DECODERS + # rename prefix of decoders + # multimodal autoencoding model + name = name.replace( + "multimodal_decoder/~/basic_decoder/cross_attention/", "decoder.decoder.decoding_cross_attention." + ) + name = name.replace("multimodal_decoder/~decoder_query/audio_padding/pos_embs", "decoder.padding.audio") + name = name.replace("multimodal_decoder/~decoder_query/image_padding/pos_embs", "decoder.padding.image") + name = name.replace("multimodal_decoder/~decoder_query/label_padding/pos_embs", "decoder.padding.label") + name = name.replace("multimodal_decoder/~/basic_decoder/output/b", "decoder.decoder.final_layer.bias") + name = name.replace("multimodal_decoder/~/basic_decoder/output/w", "decoder.decoder.final_layer.weight") + if architecture == "multimodal_autoencoding": + name = name.replace( + "classification_decoder/~/basic_decoder/~/trainable_position_encoding/pos_embs", + "decoder.modalities.label.decoder.output_position_encodings.position_embeddings", + ) + # flow model + name = name.replace( + "flow_decoder/~/basic_decoder/cross_attention/", "decoder.decoder.decoding_cross_attention." + ) + name = name.replace("flow_decoder/~/basic_decoder/output/w", "decoder.decoder.final_layer.weight") + name = name.replace("flow_decoder/~/basic_decoder/output/b", "decoder.decoder.final_layer.bias") + # image models + name = name.replace( + "classification_decoder/~/basic_decoder/~/trainable_position_encoding/pos_embs", + "decoder.decoder.output_position_encodings.position_embeddings", + ) + name = name.replace( + "basic_decoder/~/trainable_position_encoding/pos_embs", + "decoder.output_position_encodings.position_embeddings", + ) + name = name.replace( + "classification_decoder/~/basic_decoder/cross_attention/", "decoder.decoder.decoding_cross_attention." + ) + name = name.replace("classification_decoder/~/basic_decoder/output/b", "decoder.decoder.final_layer.bias") + name = name.replace("classification_decoder/~/basic_decoder/output/w", "decoder.decoder.final_layer.weight") + name = name = name.replace("classification_decoder/~/basic_decoder/~/", "decoder.decoder.") + name = name.replace("basic_decoder/cross_attention/", "decoder.decoding_cross_attention.") + name = name.replace("basic_decoder/~/", "decoder.") + + # POSTPROCESSORS + name = name.replace( + "projection_postprocessor/linear/b", "output_postprocessor.modalities.image.classifier.bias" + ) + name = name.replace( + "projection_postprocessor/linear/w", "output_postprocessor.modalities.image.classifier.weight" + ) + name = name.replace( + "classification_postprocessor/linear/b", "output_postprocessor.modalities.label.classifier.bias" + ) + name = name.replace( + "classification_postprocessor/linear/w", "output_postprocessor.modalities.label.classifier.weight" + ) + name = name.replace("audio_postprocessor/linear/b", "output_postprocessor.modalities.audio.classifier.bias") + name = name.replace("audio_postprocessor/linear/w", "output_postprocessor.modalities.audio.classifier.weight") + + # PERCEIVER MODEL + + # rename latent embeddings + name = name.replace("perceiver_encoder/~/trainable_position_encoding/pos_embs", "embeddings.latents") + # rename latent embeddings (for multimodal model) + name = name.replace("encoder/~/trainable_position_encoding/pos_embs", "embeddings.latents") + + # rename prefixes + if name.startswith("perceiver_encoder/~/"): + if "self_attention" in name: + suffix = "self_attends." + else: + suffix = "" + name = name.replace("perceiver_encoder/~/", "encoder." + suffix) + if name.startswith("encoder/~/"): + if "self_attention" in name: + suffix = "self_attends." + else: + suffix = "" + name = name.replace("encoder/~/", "encoder." + suffix) + # rename layernorm parameters + if "offset" in name: + name = name.replace("offset", "bias") + if "scale" in name: + name = name.replace("scale", "weight") + # in HuggingFace, the layernorm in between attention + MLP is just called "layernorm" + # rename layernorm in between attention + MLP of cross-attention + if "cross_attention" in name and "layer_norm_2" in name: + name = name.replace("layer_norm_2", "layernorm") + # rename layernorm in between attention + MLP of self-attention + if "self_attention" in name and "layer_norm_1" in name: + name = name.replace("layer_norm_1", "layernorm") + + # in HuggingFace, the layernorms for queries + keys are called "layernorm1" and "layernorm2" + if "cross_attention" in name and "layer_norm_1" in name: + name = name.replace("layer_norm_1", "attention.self.layernorm2") + if "cross_attention" in name and "layer_norm" in name: + name = name.replace("layer_norm", "attention.self.layernorm1") + if "self_attention" in name and "layer_norm" in name: + name = name.replace("layer_norm", "attention.self.layernorm1") + + # rename special characters by dots + name = name.replace("-", ".") + name = name.replace("/", ".") + # rename keys, queries, values and output of attention layers + if ("cross_attention" in name or "self_attention" in name) and "mlp" not in name: + if "linear.b" in name: + name = name.replace("linear.b", "self.query.bias") + if "linear.w" in name: + name = name.replace("linear.w", "self.query.weight") + if "linear_1.b" in name: + name = name.replace("linear_1.b", "self.key.bias") + if "linear_1.w" in name: + name = name.replace("linear_1.w", "self.key.weight") + if "linear_2.b" in name: + name = name.replace("linear_2.b", "self.value.bias") + if "linear_2.w" in name: + name = name.replace("linear_2.w", "self.value.weight") + if "linear_3.b" in name: + name = name.replace("linear_3.b", "output.dense.bias") + if "linear_3.w" in name: + name = name.replace("linear_3.w", "output.dense.weight") + if "self_attention_" in name: + name = name.replace("self_attention_", "") + if "self_attention" in name: + name = name.replace("self_attention", "0") + # rename dense layers of 2-layer MLP + if "mlp" in name: + if "linear.b" in name: + name = name.replace("linear.b", "dense1.bias") + if "linear.w" in name: + name = name.replace("linear.w", "dense1.weight") + if "linear_1.b" in name: + name = name.replace("linear_1.b", "dense2.bias") + if "linear_1.w" in name: + name = name.replace("linear_1.w", "dense2.weight") + + # finally, TRANSPOSE if kernel and not embedding layer, and set value + if name[-6:] == "weight" and "embeddings" not in name: + param = np.transpose(param) + + # if batchnorm, we need to squeeze it + if "batchnorm" in name: + param = np.squeeze(param) + + if "embedding_decoder" not in name: + state_dict["perceiver." + name] = torch.from_numpy(param) + else: + state_dict[name] = torch.from_numpy(param) + + +@torch.no_grad() +def convert_perceiver_checkpoint(pickle_file, pytorch_dump_folder_path, architecture="MLM"): + """ + Copy/paste/tweak model's weights to our Perceiver structure. + """ + + # load parameters as FlatMapping data structure + with open(pickle_file, "rb") as f: + checkpoint = pickle.loads(f.read()) + + state = None + if isinstance(checkpoint, dict) and architecture in [ + "image_classification", + "image_classification_fourier", + "image_classification_conv", + ]: + # the image classification_conv checkpoint also has batchnorm states (running_mean and running_var) + params = checkpoint["params"] + state = checkpoint["state"] + else: + params = checkpoint + + # turn into initial state dict + state_dict = dict() + for scope_name, parameters in hk.data_structures.to_mutable_dict(params).items(): + for param_name, param in parameters.items(): + state_dict[scope_name + "/" + param_name] = param + + if state is not None: + # add state variables + for scope_name, parameters in hk.data_structures.to_mutable_dict(state).items(): + for param_name, param in parameters.items(): + state_dict[scope_name + "/" + param_name] = param + + # rename keys + rename_keys(state_dict, architecture=architecture) + + # load HuggingFace model + config = PerceiverConfig() + subsampling = None + repo_id = "datasets/huggingface/label-files" + if architecture == "MLM": + config.qk_channels = 8 * 32 + config.v_channels = 1280 + model = PerceiverForMaskedLM(config) + elif "image_classification" in architecture: + config.num_latents = 512 + config.d_latents = 1024 + config.d_model = 512 + config.num_blocks = 8 + config.num_self_attends_per_block = 6 + config.num_cross_attention_heads = 1 + config.num_self_attention_heads = 8 + config.qk_channels = None + config.v_channels = None + # set labels + config.num_labels = 1000 + filename = "imagenet-1k-id2label.json" + id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + if architecture == "image_classification": + config.image_size = 224 + model = PerceiverForImageClassificationLearned(config) + elif architecture == "image_classification_fourier": + config.d_model = 261 + model = PerceiverForImageClassificationFourier(config) + elif architecture == "image_classification_conv": + config.d_model = 322 + model = PerceiverForImageClassificationConvProcessing(config) + else: + raise ValueError(f"Architecture {architecture} not supported") + elif architecture == "optical_flow": + config.num_latents = 2048 + config.d_latents = 512 + config.d_model = 322 + config.num_blocks = 1 + config.num_self_attends_per_block = 24 + config.num_self_attention_heads = 16 + config.num_cross_attention_heads = 1 + model = PerceiverForOpticalFlow(config) + elif architecture == "multimodal_autoencoding": + config.num_latents = 28 * 28 * 1 + config.d_latents = 512 + config.d_model = 704 + config.num_blocks = 1 + config.num_self_attends_per_block = 8 + config.num_self_attention_heads = 8 + config.num_cross_attention_heads = 1 + config.num_labels = 700 + # define dummy inputs + subsampling (as each forward pass is only on a chunk of image + audio data) + images = torch.randn((1, 16, 3, 224, 224)) + audio = torch.randn((1, 30720, 1)) + nchunks = 128 + image_chunk_size = np.prod((16, 224, 224)) // nchunks + audio_chunk_size = audio.shape[1] // config.samples_per_patch // nchunks + # process the first chunk + chunk_idx = 0 + subsampling = { + "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)), + "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)), + "label": None, + } + model = PerceiverForMultimodalAutoencoding(config) + # set labels + filename = "kinetics700-id2label.json" + id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + else: + raise ValueError(f"Architecture {architecture} not supported") + model.eval() + + # load weights + model.load_state_dict(state_dict) + + # prepare dummy input + input_mask = None + if architecture == "MLM": + tokenizer = PerceiverTokenizer.from_pretrained("/Users/NielsRogge/Documents/Perceiver/Tokenizer files") + text = "This is an incomplete sentence where some words are missing." + encoding = tokenizer(text, padding="max_length", return_tensors="pt") + # mask " missing.". Note that the model performs much better if the masked chunk starts with a space. + encoding.input_ids[0, 51:60] = tokenizer.mask_token_id + inputs = encoding.input_ids + input_mask = encoding.attention_mask + elif architecture in ["image_classification", "image_classification_fourier", "image_classification_conv"]: + feature_extractor = PerceiverFeatureExtractor() + image = prepare_img() + encoding = feature_extractor(image, return_tensors="pt") + inputs = encoding.pixel_values + elif architecture == "optical_flow": + inputs = torch.randn(1, 2, 27, 368, 496) + elif architecture == "multimodal_autoencoding": + images = torch.randn((1, 16, 3, 224, 224)) + audio = torch.randn((1, 30720, 1)) + inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700))) + + # forward pass + if architecture == "multimodal_autoencoding": + outputs = model(inputs=inputs, attention_mask=input_mask, subsampled_output_points=subsampling) + else: + outputs = model(inputs=inputs, attention_mask=input_mask) + logits = outputs.logits + + # verify logits + if not isinstance(logits, dict): + print("Shape of logits:", logits.shape) + else: + for k, v in logits.items(): + print(f"Shape of logits of modality {k}", v.shape) + + if architecture == "MLM": + expected_slice = torch.tensor( + [[-11.8336, -11.6850, -11.8483], [-12.8149, -12.5863, -12.7904], [-12.8440, -12.6410, -12.8646]] + ) + assert torch.allclose(logits[0, :3, :3], expected_slice) + masked_tokens_predictions = logits[0, 51:60].argmax(dim=-1).tolist() + expected_list = [38, 115, 111, 121, 121, 111, 116, 109, 52] + assert masked_tokens_predictions == expected_list + print("Greedy predictions:") + print(masked_tokens_predictions) + print() + print("Predicted string:") + print(tokenizer.decode(masked_tokens_predictions)) + + elif architecture in ["image_classification", "image_classification_fourier", "image_classification_conv"]: + print("Predicted class:", model.config.id2label[logits.argmax(-1).item()]) + + # Finally, save files + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--pickle_file", + type=str, + default=None, + required=True, + help="Path to local pickle file of a Perceiver checkpoint you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + required=True, + help="Path to the output PyTorch model directory, provided as a string.", + ) + parser.add_argument( + "--architecture", + default="MLM", + type=str, + help=""" + Architecture, provided as a string. One of 'MLM', 'image_classification', image_classification_fourier', + image_classification_fourier', 'optical_flow' or 'multimodal_autoencoding'. + """, + ) + + args = parser.parse_args() + convert_perceiver_checkpoint(args.pickle_file, args.pytorch_dump_folder_path, args.architecture) diff --git a/src/transformers/models/perceiver/feature_extraction_perceiver.py b/src/transformers/models/perceiver/feature_extraction_perceiver.py new file mode 100644 index 0000000000..a15c7df204 --- /dev/null +++ b/src/transformers/models/perceiver/feature_extraction_perceiver.py @@ -0,0 +1,189 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for Perceiver.""" + +from typing import Optional, Union + +import numpy as np +from PIL import Image + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...file_utils import TensorType +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ImageFeatureExtractionMixin, + ImageInput, + is_torch_tensor, +) +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a Perceiver feature extractor. + + This feature extractor inherits from :class:`~transformers.ImageFeatureExtractionMixin` which contains most of the + main methods. Users should refer to this superclass for more information regarding those methods. + + Args: + do_center_crop (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to crop the input at the center. If the input size is smaller than :obj:`crop_size` along any edge, + the image is padded with 0's and then center cropped. + crop_size (:obj:`int`, `optional`, defaults to 256): + Desired output size when applying center-cropping. Only has an effect if :obj:`do_center_crop` is set to + :obj:`True`. + do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to resize the input to a certain :obj:`size`. + size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 224): + Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an + integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize` + is set to :obj:`True`. + resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BICUBIC`): + An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`, + :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`. + Only has an effect if :obj:`do_resize` is set to :obj:`True`. + do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to normalize the input with :obj:`image_mean` and :obj:`image_std`. + image_mean (:obj:`List[int]`, defaults to :obj:`[0.485, 0.456, 0.406]`): + The sequence of means for each channel, to be used when normalizing images. + image_std (:obj:`List[int]`, defaults to :obj:`[0.229, 0.224, 0.225]`): + The sequence of standard deviations for each channel, to be used when normalizing images. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_center_crop=True, + crop_size=256, + do_resize=True, + size=224, + resample=Image.BICUBIC, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + + def center_crop(self, image): + """ + Crops :obj:`image` to `self.crop_size` using a center crop. Note that if the image is too small to be cropped + to the size given, it will be padded (so the returned result has the size asked). + + Args: + image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`): + The image to resize. + """ + + if isinstance(image, Image.Image): + image = self.to_numpy_array(image) + + image_height, image_width = image.shape[-2:] + + padded_center_crop_size = ( + (self.size / (self.crop_size)) * np.minimum(image_height, image_width).astype(np.float32) + ).astype(np.int32) + + offset_height = ((image_height - padded_center_crop_size) + 1) // 2 + offset_width = ((image_width - padded_center_crop_size) + 1) // 2 + crop_window = [offset_height, offset_width, padded_center_crop_size, padded_center_crop_size] + + image = image[ + :, crop_window[0] : crop_window[0] + crop_window[2], crop_window[1] : crop_window[1] + crop_window[3] + ] + + return image + + def __call__( + self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s). + + .. warning:: + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + + Args: + images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`): + If set, will return tensors of a particular framework. Acceptable values are: + + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects. + * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects. + + Returns: + :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height, + width). + """ + # Input type checking for clearer error + valid_images = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example)," + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." + ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + if not is_batched: + images = [images] + + # transformations (center cropping + resizing + normalization) + if self.do_center_crop and self.crop_size is not None: + images = [self.center_crop(image) for image in images] + if self.do_resize and self.size is not None and self.resample is not None: + images = [self.resize(image=image, size=self.size, resample=self.resample) for image in images] + if self.do_normalize: + images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images] + + # return as BatchFeature + data = {"pixel_values": images} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py new file mode 100755 index 0000000000..1e98f5fff2 --- /dev/null +++ b/src/transformers/models/perceiver/modeling_perceiver.py @@ -0,0 +1,3118 @@ +# coding=utf-8 +# Copyright 2021 Deepmind and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Perceiver model. """ + +import abc +import math +from dataclasses import dataclass +from functools import reduce +from operator import __add__ +from typing import Any, Callable, Mapping, Optional, Tuple + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import BaseModelOutputWithCrossAttentions +from ...modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from ...utils import logging +from .configuration_perceiver import PerceiverConfig + + +ModalitySizeType = Mapping[str, int] +PreprocessorOutputType = Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor] +PreprocessorType = Callable[..., PreprocessorOutputType] +PostprocessorType = Callable[..., Any] + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "deepmind/language-perceiver" +_CONFIG_FOR_DOC = "PerceiverConfig" +_TOKENIZER_FOR_DOC = "PerceiverTokenizer" + +PERCEIVER_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "deepmind/language-perceiver", + # See all Perceiver models at https://huggingface.co/models?filter=perceiver +] + + +@dataclass +class PerceiverModelOutput(ModelOutput): + """ + Base class for Perceiver base model's outputs, with potential hidden states, attentions and cross-attentions. + + Args: + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of + each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the cross-attention heads. + """ + + logits: torch.FloatTensor = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class PerceiverDecoderOutput(ModelOutput): + """ + Base class for Perceiver decoder outputs, with potential cross-attentions. + + Args: + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_labels)`): + Output of the basic decoder. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the cross-attention heads. + """ + + logits: torch.FloatTensor = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class PerceiverMaskedLMOutput(ModelOutput): + """ + Base class for Perceiver's masked language model outputs. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Masked language modeling (MLM) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of + each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, num_latents, + num_latents)`. Attentions weights after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the cross-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class PerceiverClassifierOutput(ModelOutput): + """ + Base class for Perceiver's outputs of sequence/image classification models, optical flow and multimodal + autoencoding. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of + each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the cross-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class PerceiverEmbeddings(nn.Module): + """Construct the latent embeddings.""" + + def __init__(self, config): + super().__init__() + self.latents = nn.Parameter(torch.randn(config.num_latents, config.d_latents)) + + def forward(self, batch_size): + return self.latents.expand(batch_size, -1, -1) # Thanks, Phil Wang + + +class PerceiverSelfAttention(nn.Module): + """Multi-headed {cross, self}-attention. Can be used both in the encoder as well as in the decoder.""" + + def __init__( + self, + config, + is_cross_attention=False, + qk_channels=None, + v_channels=None, + num_heads=1, + q_dim=None, + kv_dim=None, + ): + super().__init__() + self.num_heads = num_heads + # Q and K must have the same number of channels. + # Default to preserving Q's input's shape. + if qk_channels is None: + qk_channels = q_dim + # V's num_channels determines the shape of the output of QKV-attention. + # Default to the same number of channels used in the key-query operation. + if v_channels is None: + v_channels = qk_channels + if qk_channels % num_heads != 0: + raise ValueError(f"qk_channels ({qk_channels}) must be divisible by num_heads ({num_heads}).") + if v_channels % num_heads != 0: + raise ValueError(f"v_channels ({v_channels}) must be divisible by num_heads ({num_heads}).") + + self.qk_channels = qk_channels + self.v_channels = v_channels + self.qk_channels_per_head = self.qk_channels // num_heads + self.v_channels_per_head = self.v_channels // num_heads + + # Layer normalization + self.layernorm1 = nn.LayerNorm(q_dim) + self.layernorm2 = nn.LayerNorm(kv_dim) if is_cross_attention else nn.Identity() + + # Projection matrices + self.query = nn.Linear(q_dim, qk_channels) + self.key = nn.Linear(kv_dim, qk_channels) + self.value = nn.Linear(kv_dim, v_channels) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x, channels_per_head): + new_x_shape = x.size()[:-1] + (self.num_heads, channels_per_head) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + inputs=None, + inputs_mask=None, + output_attentions=False, + ): + hidden_states = self.layernorm1(hidden_states) + inputs = self.layernorm2(inputs) + + # Project queries, keys and values to a common feature dimension. If this is instantiated as a cross-attention module, + # the keys and values come from the inputs; the attention mask needs to be such that the inputs's non-relevant tokens are not attended to. + is_cross_attention = inputs is not None + queries = self.query(hidden_states) + + if is_cross_attention: + keys = self.key(inputs) + values = self.value(inputs) + attention_mask = inputs_mask + else: + keys = self.key(hidden_states) + values = self.value(hidden_states) + + # Reshape channels for multi-head attention. + # We reshape from (batch_size, time, channels) to (batch_size, num_heads, time, channels per head) + queries = self.transpose_for_scores(queries, self.qk_channels_per_head) + keys = self.transpose_for_scores(keys, self.qk_channels_per_head) + values = self.transpose_for_scores(values, self.v_channels_per_head) + + # Take the dot product between the queries and keys to get the raw attention scores. + attention_scores = torch.matmul(queries, keys.transpose(-1, -2)) + + batch_size, num_heads, seq_len, q_head_dim = queries.shape + _, _, _, v_head_dim = values.shape + hiddens = self.num_heads * v_head_dim + + attention_scores = attention_scores / math.sqrt(q_head_dim) + + if attention_mask is not None: + # Apply the attention mask (precomputed for all layers in PerceiverModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, values) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (hiddens,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class PerceiverSelfOutput(nn.Module): + def __init__(self, config, input_channels, output_channels): + super().__init__() + self.dense = nn.Linear(input_channels, output_channels) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + return hidden_states + + +class PerceiverAttention(nn.Module): + """Attention module, including a dense block.""" + + def __init__( + self, + config, + is_cross_attention=False, + qk_channels=None, + v_channels=None, + num_heads=1, + q_dim=None, + kv_dim=None, + use_query_residual=True, + ): + super().__init__() + # MultiHead attention + if is_cross_attention and qk_channels is None: + if config.cross_attention_shape_for_attention == "q": + qk_channels = q_dim + elif config.cross_attention_shape_for_attention == "kv": + qk_channels = kv_dim + else: + raise ValueError( + f"Unknown value {config.cross_attention_shape_for_attention} for " + "cross_attention_shape_for_attention." + ) + else: + if qk_channels is None: + qk_channels = q_dim + if v_channels is None: + v_channels = qk_channels + self.self = PerceiverSelfAttention( + config, + is_cross_attention=is_cross_attention, + qk_channels=qk_channels, + v_channels=v_channels, + num_heads=num_heads, + q_dim=q_dim, + kv_dim=kv_dim, + ) + # dense block + output_channels = None + if is_cross_attention: + output_channels = q_dim + else: + if output_channels is None: + output_channels = v_channels + self.output = PerceiverSelfOutput(config, input_channels=self.self.v_channels, output_channels=output_channels) + self.use_query_residual = use_query_residual + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + inputs=None, + inputs_mask=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + inputs, + inputs_mask, + output_attentions, + ) + + # Output projection + attention_output = self.output(self_outputs[0]) + + # Optionally include a residual to the original queries. + # Consider omitting the residual if the semantics of query and output + # are different, e.g. if queries are positions and outputs are pixels. + if self.use_query_residual: + attention_output = attention_output + hidden_states + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class PerceiverMLP(nn.Module): + """A Transformer-style dense module to follow attention.""" + + def __init__(self, config, input_size, widening_factor): + super().__init__() + self.dense1 = nn.Linear(input_size, widening_factor * input_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + self.dense2 = nn.Linear(input_size, input_size) + + def forward(self, hidden_states): + hidden_states = self.dense1(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.dense2(hidden_states) + return hidden_states + + +class PerceiverLayer(nn.Module): + def __init__( + self, + config, + is_cross_attention=False, + qk_channels=None, + v_channels=None, + num_heads=1, + q_dim=None, + kv_dim=None, + widening_factor=4, + use_query_residual=True, + ): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = PerceiverAttention( + config, + is_cross_attention=is_cross_attention, + qk_channels=qk_channels, + v_channels=v_channels, + num_heads=num_heads, + q_dim=q_dim, + kv_dim=kv_dim, + use_query_residual=use_query_residual, + ) + self.layernorm = nn.LayerNorm(q_dim) + self.mlp = PerceiverMLP(config, input_size=q_dim, widening_factor=widening_factor) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + inputs=None, + inputs_mask=None, + output_attentions=False, + ): + attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + inputs, + inputs_mask, + output_attentions, + ) + attention_output = attention_outputs[0] + + outputs = attention_outputs[1:] # add attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + + layer_output = layer_output + attention_output # residual connection + + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + layer_output = self.layernorm(attention_output) + layer_output = self.mlp(layer_output) + return layer_output + + +class PerceiverEncoder(nn.Module): + """The Perceiver Encoder: a scalable, fully attentional encoder.""" + + def __init__(self, config): + super().__init__() + self.config = config + + # Check that we can use multihead-attention with these shapes. + if config.d_latents % config.num_self_attention_heads != 0: + raise ValueError( + f"num_z_channels ({config.d_latents}) must be divisible by" + f" num_self_attend_heads ({config.num_self_attention_heads})." + ) + if config.d_latents % config.num_cross_attention_heads != 0: + raise ValueError( + f"num_z_channels ({config.d_latents}) must be divisible by" + f" num_cross_attend_heads ({config.num_cross_attention_heads})." + ) + + # Construct the cross attention layer. + self.cross_attention = PerceiverLayer( + config, + is_cross_attention=True, + qk_channels=config.qk_channels, + v_channels=config.v_channels, + num_heads=config.num_cross_attention_heads, + q_dim=config.d_latents, + kv_dim=config.d_model, + widening_factor=config.cross_attention_widening_factor, + use_query_residual=config.use_query_residual, + ) + + # Construct a single block of self-attention layers. + # We get deeper architectures by applying this block more than once. + self_attention_layers = [] + for _ in range(config.num_self_attends_per_block): + layer = PerceiverLayer( + config, + is_cross_attention=False, + qk_channels=config.qk_channels, + v_channels=config.v_channels, + num_heads=config.num_self_attention_heads, + q_dim=config.d_latents, + kv_dim=config.d_latents, + widening_factor=config.self_attention_widening_factor, + ) + self_attention_layers.append(layer) + + self.self_attends = nn.ModuleList(self_attention_layers) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + inputs=None, + inputs_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + # Apply the cross-attention between the latents (hidden_states) and inputs: + layer_outputs = self.cross_attention( + hidden_states, + attention_mask=attention_mask, + head_mask=None, + inputs=inputs, + inputs_mask=inputs_mask, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_cross_attentions = all_cross_attentions + (layer_outputs[1],) + + # Apply the block of self-attention layers more than once: + for _ in range(self.config.num_blocks): + for i, layer_module in enumerate(self.self_attends): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask=attention_mask, + head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class PerceiverPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = PerceiverConfig + base_model_prefix = "perceiver" + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif hasattr(module, "latents"): + module.latents.data.normal_(mean=0.0, std=self.config.initializer_range) + elif hasattr(module, "position_embeddings") and isinstance(module, PerceiverTrainablePositionEncoding): + module.position_embeddings.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.ParameterDict): + for modality in module.keys(): + module[modality].data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +PERCEIVER_START_DOCSTRING = r""" + This model is a PyTorch `torch.nn.Module `_ sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config (:class:`~transformers.PerceiverConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +PERCEIVER_MODEL_START_DOCSTRING = r""" + This model is a PyTorch `torch.nn.Module `_ sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config (:class:`~transformers.PerceiverConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. + decoder (`DecoderType`, `optional`): + Optional decoder to use to decode the latent representation of the encoder. Examples include + `transformers.models.perceiver.modeling_perceiver.PerceiverBasicDecoder`, + `transformers.models.perceiver.modeling_perceiver.PerceiverClassificationDecoder`, + `transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder`. + input_preprocessor (`PreprocessorType`, `optional`): + Optional input preprocessor to use. Examples include + `transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor`, + `transformers.models.perceiver.modeling_perceiver.PerceiverAudioPreprocessor`, + `transformers.models.perceiver.modeling_perceiver.PerceiverTextPreprocessor`, + `transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor`. + output_postprocessor (`PostprocessorType`, `optional`): + Optional output postprocessor to use. Examples include + `transformers.models.perceiver.modeling_perceiver.PerceiverImagePostprocessor`, + `transformers.models.perceiver.modeling_perceiver.PerceiverAudioPostprocessor`, + `transformers.models.perceiver.modeling_perceiver.PerceiverClassificationPostprocessor`, + `transformers.models.perceiver.modeling_perceiver.PerceiverProjectionPostprocessor`, + `transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalPostprocessor`. + + Note that you can define your own decoders, preprocessors and/or postprocessors to fit your use-case. +""" + +PERCEIVER_INPUTS_DOCSTRING = r""" + Args: + inputs (:obj:`torch.FloatTensor`): + Inputs to the perceiver. Can be anything: images, text, audio, video, etc. + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + """The Perceiver: a scalable, fully attentional architecture.""", + PERCEIVER_MODEL_START_DOCSTRING, +) +class PerceiverModel(PerceiverPreTrainedModel): + def __init__( + self, + config, + decoder=None, + input_preprocessor: PreprocessorType = None, + output_postprocessor: PostprocessorType = None, + ): + super().__init__(config) + self.config = config + + self.input_preprocessor = input_preprocessor + self.output_postprocessor = output_postprocessor + self.embeddings = PerceiverEmbeddings(config) + self.encoder = PerceiverEncoder(config) + self.decoder = decoder + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.latents + + def set_input_embeddings(self, value): + self.embeddings.latents = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=PerceiverModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + inputs, + attention_mask=None, + subsampled_output_points=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.input_preprocessor is not None: + inputs, modality_sizes, inputs_without_pos = self.input_preprocessor(inputs) + else: + modality_sizes = None + inputs_without_pos = None + + if inputs.size()[-1] != self.config.d_model: + raise ValueError( + f"Last dimension of the inputs: {inputs.size()[-1]} doesn't correspond to config.d_model: {self.config.d_model}. " + "Please update config.d_model appropriately." + ) + else: + input_shape = inputs.size() + + batch_size, seq_length, _ = input_shape + device = inputs.device + + # If no attention mask is provided, make them all ones + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length)), device=device) + # Make the attention mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = self.invert_attention_mask(attention_mask) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_blocks x num_heads] + # and head_mask is converted to shape [num_blocks x batch x num_heads x N x N] + head_mask = self.get_head_mask(head_mask, self.config.num_blocks * self.config.num_self_attends_per_block) + + embedding_output = self.embeddings(batch_size=batch_size) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=None, + head_mask=head_mask, + inputs=inputs, + inputs_mask=extended_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + logits = None + if self.decoder: + if subsampled_output_points is not None: + output_modality_sizes = { + "audio": subsampled_output_points["audio"].shape[0], + "image": subsampled_output_points["image"].shape[0], + "label": 1, + } + else: + output_modality_sizes = None + decoder_query = self.decoder.decoder_query( + inputs, modality_sizes, inputs_without_pos, subsampled_points=subsampled_output_points + ) + decoder_outputs = self.decoder( + decoder_query, + z=sequence_output, + query_mask=extended_attention_mask, + output_attentions=output_attentions, + ) + logits = decoder_outputs.logits + + # add cross-attentions of decoder + if output_attentions and decoder_outputs.cross_attentions is not None: + if return_dict: + encoder_outputs.cross_attentions = ( + encoder_outputs.cross_attentions + decoder_outputs.cross_attentions + ) + else: + encoder_outputs = encoder_outputs + decoder_outputs.cross_attentions + + if self.output_postprocessor: + logits = self.output_postprocessor(logits, modality_sizes=output_modality_sizes) + + if not return_dict: + if logits is not None: + return (logits, sequence_output) + encoder_outputs[1:] + else: + return (sequence_output,) + encoder_outputs[1:] + + return PerceiverModelOutput( + logits=logits, + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings("""Example use of Perceiver for masked language modeling. """, PERCEIVER_START_DOCSTRING) +class PerceiverForMaskedLM(PerceiverPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + trainable_position_encoding_kwargs_decoder = dict( + num_channels=config.d_model, index_dims=config.max_position_embeddings + ) + + self.perceiver = PerceiverModel( + config, + input_preprocessor=PerceiverTextPreprocessor(config), + decoder=PerceiverBasicDecoder( + config, + output_num_channels=config.d_latents, + output_index_dims=config.max_position_embeddings, # we need to define the seq_len of the inputs beforehand + num_channels=config.d_model, + qk_channels=8 * 32, + v_channels=config.d_model, + num_heads=8, + use_query_residual=False, + final_project=False, + trainable_position_encoding_kwargs=trainable_position_encoding_kwargs_decoder, + ), + ) + self.embedding_decoder = PerceiverEmbeddingDecoder(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=PerceiverMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + inputs=None, + attention_mask=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.perceiver( + inputs=inputs, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.embedding_decoder( + outputs.logits if return_dict else outputs[0], embedding_layer=self.perceiver.input_preprocessor.embeddings + ) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return PerceiverMaskedLMOutput( + loss=masked_lm_loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings("""Example use of Perceiver for text classification. """, PERCEIVER_START_DOCSTRING) +class PerceiverForSequenceClassification(PerceiverPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + trainable_position_encoding_kwargs_decoder = dict(num_channels=config.d_latents, index_dims=1) + + self.num_labels = config.num_labels + self.perceiver = PerceiverModel( + config, + input_preprocessor=PerceiverTextPreprocessor(config), + decoder=PerceiverClassificationDecoder( + config, + num_channels=config.d_latents, + trainable_position_encoding_kwargs=trainable_position_encoding_kwargs_decoder, + use_query_residual=True, + ), + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=PerceiverClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + inputs=None, + attention_mask=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Examples:: + + >>> from transformers import PerceiverTokenizer, PerceiverForSequenceClassification + + >>> tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver') + >>> model = PerceiverForSequenceClassification.from_pretrained('deepmind/language-perceiver') + + >>> text = "hello world" + >>> inputs = tokenizer(images=image, return_tensors="pt").input_ids + >>> outputs = model(inputs=inputs) + >>> logits = outputs.logits + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.perceiver( + inputs=inputs, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = outputs.logits if return_dict else outputs[0] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return PerceiverClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ +Example use of Perceiver for image classification, for tasks such as ImageNet. + +This model uses learned position embeddings. In other words, this model is not given any privileged information about +the structure of images. As shown in the paper, this model can achieve a top-1 accuracy of 72.7 on ImageNet. + +`PerceiverForImageClassificationLearned` uses +`transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor` (with `prep_type` = "conv1x1") to +preprocess the input images, and `transformers.models.perceiver.modeling_perceiver.PerceiverClassificationDecoder` to +decode the latent representation of `~transformers.PerceiverModel` into classification logits. +""", + PERCEIVER_START_DOCSTRING, +) +class PerceiverForImageClassificationLearned(PerceiverPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + trainable_position_encoding_kwargs_preprocessor = dict(num_channels=256, index_dims=config.image_size ** 2) + trainable_position_encoding_kwargs_decoder = dict(num_channels=config.d_latents, index_dims=1) + + self.num_labels = config.num_labels + self.perceiver = PerceiverModel( + config, + input_preprocessor=PerceiverImagePreprocessor( + config, + prep_type="conv1x1", + spatial_downsample=1, + out_channels=256, + position_encoding_type="trainable", + concat_or_add_pos="concat", + project_pos_dim=256, + trainable_position_encoding_kwargs=trainable_position_encoding_kwargs_preprocessor, + ), + decoder=PerceiverClassificationDecoder( + config, + num_channels=config.d_latents, + trainable_position_encoding_kwargs=trainable_position_encoding_kwargs_decoder, + use_query_residual=True, + ), + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=PerceiverClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + inputs=None, + attention_mask=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Examples:: + + >>> from transformers import PerceiverFeatureExtractor, PerceiverForImageClassificationLearned + >>> from PIL import Image + >>> import requests + + >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained('deepmind/vision-perceiver-learned') + >>> model = PerceiverForImageClassificationLearned.from_pretrained('deepmind/vision-perceiver-learned') + + >>> inputs = feature_extractor(images=image, return_tensors="pt").pixel_values + >>> outputs = model(inputs=inputs) + >>> logits = outputs.logits + >>> # model predicts one of the 1000 ImageNet classes + >>> predicted_class_idx = logits.argmax(-1).item() + >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.perceiver( + inputs=inputs, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return PerceiverClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ +Example use of Perceiver for image classification, for tasks such as ImageNet. + +This model uses fixed 2D Fourier position embeddings. As shown in the paper, this model can achieve a top-1 accuracy of +79.0 on ImageNet, and 84.5 when pre-trained on a large-scale dataset (i.e. JFT). + +`PerceiverForImageClassificationLearned` uses +`transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor` (with `prep_type` = "pixels") to +preprocess the input images, and `transformers.models.perceiver.modeling_perceiver.PerceiverClassificationDecoder` to +decode the latent representation of `~transformers.PerceiverModel` into classification logits. +""", + PERCEIVER_START_DOCSTRING, +) +class PerceiverForImageClassificationFourier(PerceiverPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + fourier_position_encoding_kwargs_preprocessor = dict( + concat_pos=True, max_resolution=(224, 224), num_bands=64, sine_only=False + ) + trainable_position_encoding_kwargs_decoder = dict(num_channels=config.d_latents, index_dims=1) + + self.num_labels = config.num_labels + self.perceiver = PerceiverModel( + config, + input_preprocessor=PerceiverImagePreprocessor( + config, + prep_type="pixels", + spatial_downsample=1, + fourier_position_encoding_kwargs=fourier_position_encoding_kwargs_preprocessor, + ), + decoder=PerceiverClassificationDecoder( + config, + num_channels=config.d_latents, + trainable_position_encoding_kwargs=trainable_position_encoding_kwargs_decoder, + use_query_residual=True, + ), + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=PerceiverClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + inputs=None, + attention_mask=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Examples:: + + >>> from transformers import PerceiverFeatureExtractor, PerceiverForImageClassificationFourier + >>> from PIL import Image + >>> import requests + + >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained('deepmind/vision-perceiver-fourier') + >>> model = PerceiverForImageClassificationFourier.from_pretrained('deepmind/vision-perceiver-fourier') + + >>> inputs = feature_extractor(images=image, return_tensors="pt").pixel_values + >>> outputs = model(inputs=inputs) + >>> logits = outputs.logits + >>> # model predicts one of the 1000 ImageNet classes + >>> predicted_class_idx = logits.argmax(-1).item() + >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.perceiver( + inputs=inputs, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return PerceiverClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ +Example use of Perceiver for image classification, for tasks such as ImageNet. + +This model uses a 2D conv+maxpool preprocessing network. As shown in the paper, this model can achieve a top-1 accuracy +of 82.1 on ImageNet. + +`PerceiverForImageClassificationLearned` uses +`transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor` (with `prep_type` = "conv") to preprocess +the input images, and `transformers.models.perceiver.modeling_perceiver.PerceiverClassificationDecoder` to decode the +latent representation of `~transformers.PerceiverModel` into classification logits. +""", + PERCEIVER_START_DOCSTRING, +) +class PerceiverForImageClassificationConvProcessing(PerceiverPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + fourier_position_encoding_kwargs_preprocessor = dict( + concat_pos=True, max_resolution=(56, 56), num_bands=64, sine_only=False + ) + trainable_position_encoding_kwargs_decoder = dict(num_channels=config.d_latents, index_dims=1) + + self.num_labels = config.num_labels + self.perceiver = PerceiverModel( + config, + input_preprocessor=PerceiverImagePreprocessor( + config, + prep_type="conv", + spatial_downsample=1, + position_encoding_type="fourier", + fourier_position_encoding_kwargs=fourier_position_encoding_kwargs_preprocessor, + ), + decoder=PerceiverClassificationDecoder( + config, + num_channels=config.d_latents, + trainable_position_encoding_kwargs=trainable_position_encoding_kwargs_decoder, + use_query_residual=True, + ), + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=PerceiverClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + inputs=None, + attention_mask=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Examples:: + + >>> from transformers import PerceiverFeatureExtractor, PerceiverForImageClassificationConvProcessing + >>> from PIL import Image + >>> import requests + + >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained('deepmind/vision-perceiver-conv') + >>> model = PerceiverForImageClassificationConvProcessing.from_pretrained('deepmind/vision-perceiver-conv') + + >>> inputs = feature_extractor(images=image, return_tensors="pt").pixel_values + >>> outputs = model(inputs=inputs) + >>> logits = outputs.logits + >>> # model predicts one of the 1000 ImageNet classes + >>> predicted_class_idx = logits.argmax(-1).item() + >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.perceiver( + inputs=inputs, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return PerceiverClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ +Example use of Perceiver for optical flow, for tasks such as Sintel and KITTI. `PerceiverForOpticalFlow` uses +`transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor` (with `prep_type` = "patches") to +preprocess the input images, and `transformers.models.perceiver.modeling_perceiver.PerceiverOpticalFlowDecoder` to +decode the latent representation of `~transformers.PerceiverModel`. + +As input, one concatenates 2 subsequent frames along the channel dimension and extract a 3 x 3 patch around each pixel +(leading to 3 x 3 x 3 x 2 = 54 values for each pixel). Fixed Fourier position encodings are used to encode the position +of each pixel in the patch. Next, one applies the Perceiver encoder. To decode, one queries the latent representation +using the same encoding used for the input. +""", + PERCEIVER_START_DOCSTRING, +) +class PerceiverForOpticalFlow(PerceiverPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + fourier_position_encoding_kwargs_preprocessor = dict( + num_bands=64, + max_resolution=config.train_size, + sine_only=False, + concat_pos=True, + ) + fourier_position_encoding_kwargs_decoder = dict( + concat_pos=True, max_resolution=config.train_size, num_bands=64, sine_only=False + ) + + self.perceiver = PerceiverModel( + config, + input_preprocessor=PerceiverImagePreprocessor( + config, + prep_type="patches", + spatial_downsample=1, + conv_after_patching=True, + conv_after_patching_in_channels=54, + temporal_downsample=2, + position_encoding_type="fourier", + # position_encoding_kwargs + fourier_position_encoding_kwargs=fourier_position_encoding_kwargs_preprocessor, + ), + decoder=PerceiverOpticalFlowDecoder( + config, + num_channels=config.d_model, + output_image_shape=config.train_size, + rescale_factor=100.0, + # decoder kwargs + use_query_residual=False, + output_num_channels=2, + # We query the decoder using the first frame features + # rather than a standard decoder position encoding. + position_encoding_type="fourier", + fourier_position_encoding_kwargs=fourier_position_encoding_kwargs_decoder, + ), + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=PerceiverClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + inputs=None, + attention_mask=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the optical flow loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. + + Returns: + + Examples:: + + >>> from transformers import PerceiverForOpticalFlow + >>> import torch + + >>> model = PerceiverForOpticalFlow.from_pretrained('deepmind/optical-flow-perceiver') + + >>> # in the Perceiver IO paper, the authors extract a 3 x 3 patch around each pixel, + >>> # leading to 3 x 3 x 3 = 27 values for each pixel (as each pixel also has 3 color channels) + >>> # patches have shape (batch_size, num_frames, num_channels, height, width) + >>> # the authors train on resolutions of 368 x 496 + >>> patches = torch.randn(1, 2, 27, 368, 496) + >>> outputs = model(inputs=patches) + >>> logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.perceiver( + inputs=inputs, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + + loss = None + if labels is not None: + raise NotImplementedError("Optical flow training is not yet supported") + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return PerceiverClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ +Example use of Perceiver for multimodal (video) autoencoding, for tasks such as Kinetics-700. + +`PerceiverForMultimodalAutoencoding` uses +`transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor` to preprocess the 3 modalities: +images, audio and class labels. This preprocessor uses modality-specific preprocessors to preprocess every modality +separately, after which they are concatenated. Trainable position embeddings are used to pad each modality to the same +number of channels to make concatenation along the time dimension possible. Next, one applies the Perceiver encoder. + +`transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder` is used to decode the latent +representation of `~transformers.PerceiverModel`. This decoder uses each modality-specific decoder to construct +queries. The decoder queries are created based on the inputs after preprocessing. However, autoencoding an entire video +in a single forward pass is computationally infeasible, hence one only uses parts of the decoder queries to do +cross-attention with the latent representation. This is determined by the subsampled indices for each modality, which +can be provided as additional input to the forward pass of `PerceiverForMultimodalAutoencoding`. + +`transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder` also pads the decoder queries of the +different modalities to the same number of channels, in order to concatenate them along the time dimension. Next, +cross-attention is performed with the latent representation of `PerceiverModel`. + +Finally, `transformers.models.perceiver.modeling_perceiver.PerceiverMultiModalPostprocessor` is used to turn this +tensor into an actual video. It first splits up the output into the different modalities, and then applies the +respective postprocessor for each modality. + +Note that, by masking the classification label during evaluation (i.e. simply providing a tensor of zeros for the +"label" modality), this auto-encoding model becomes a Kinetics 700 video classifier. +""", + PERCEIVER_START_DOCSTRING, +) +class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + n_audio_samples = config.num_frames * config.audio_samples_per_frame + + input_preprocessor = PerceiverMultimodalPreprocessor( + min_padding_size=4, + modalities={ + "audio": PerceiverAudioPreprocessor( + config, + position_encoding_type="fourier", + fourier_position_encoding_kwargs=dict( + num_bands=192, + max_resolution=(n_audio_samples,), + sine_only=False, + concat_pos=True, + ), + prep_type="patches", + samples_per_patch=config.samples_per_patch, + ), + "image": PerceiverImagePreprocessor( + config, + position_encoding_type="fourier", + fourier_position_encoding_kwargs=dict( + num_bands=32, + max_resolution=(config.num_frames, config.image_size, config.image_size), + sine_only=False, + concat_pos=True, + ), + prep_type="patches", + spatial_downsample=4, + temporal_downsample=1, + ), + "label": PerceiverOneHotPreprocessor(config), + }, + mask_probs={"image": 0.0, "audio": 0.0, "label": 1.0}, + ) + + image_decoder = PerceiverBasicVideoAutoencodingDecoder( + config, + # Autoencoding, don't pass inputs to the queries. + concat_preprocessed_input=False, + output_shape=config.output_shape, + output_num_channels=512, + use_query_residual=False, + position_encoding_only=True, + position_encoding_type="fourier", + fourier_position_encoding_kwargs=dict( + num_bands=32, + max_resolution=(config.num_frames, config.image_size, config.image_size), + sine_only=False, + concat_pos=True, + ), + ) + + decoder = PerceiverMultimodalDecoder( + config, + # Autoencoding, don't pass inputs to the queries. + concat_preprocessed_input=False, + # Modality specific decoders are used ONLY to generate queries. + # All modalties are decoded together using a unified decoder. + modalities={ + "audio": PerceiverBasicDecoder( + config, + # Autoencoding, don't pass inputs to the queries. + concat_preprocessed_input=False, + output_index_dims=(n_audio_samples // config.samples_per_patch,), + output_num_channels=512, + use_query_residual=False, + position_encoding_only=True, + position_encoding_type="fourier", + fourier_position_encoding_kwargs=dict( + num_bands=192, + max_resolution=(n_audio_samples,), + sine_only=False, + concat_pos=True, + ), + ), + "image": image_decoder, + "label": PerceiverClassificationDecoder( + config, + # Autoencoding, don't pass inputs to the queries. + concat_preprocessed_input=False, + use_query_residual=False, + position_encoding_only=True, + position_encoding_type="trainable", + trainable_position_encoding_kwargs=dict( + num_channels=1024, + index_dims=1, + ), + ), + }, + num_outputs=None, + output_num_channels=512, + use_query_residual=False, + ) + + output_postprocessor = PerceiverMultimodalPostprocessor( + modalities={ + "audio": PerceiverAudioPostprocessor(config, in_channels=512), + "image": PerceiverProjectionPostprocessor(in_channels=512, out_channels=3), + "label": PerceiverClassificationPostprocessor(config, in_channels=512), + } + ) + + self.perceiver = PerceiverModel( + config, + input_preprocessor=input_preprocessor, + decoder=decoder, + output_postprocessor=output_postprocessor, + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=PerceiverClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + inputs=None, + attention_mask=None, + subsampled_output_points=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Examples:: + + >>> from transformers import PerceiverForMultimodalAutoencoding + >>> import torch + + >>> images = torch.randn((1, 16, 3, 224, 224)) + >>> audio = torch.randn((1, 30720, 1)) + >>> inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700))) + + >>> model = PerceiverForMultimodalAutoencoding.from_pretrained('deepmind/multimodal-perceiver') + + >>> outputs = model(inputs=inputs) + >>> logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.perceiver( + inputs=inputs, + attention_mask=attention_mask, + subsampled_output_points=subsampled_output_points, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + + loss = None + if labels is not None: + raise NotImplementedError("Multimodal autoencoding training is not yet supported") + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return PerceiverClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +# Below: position encodings + + +def build_position_encoding( + position_encoding_type, + out_channels=None, + project_pos_dim=-1, + trainable_position_encoding_kwargs=None, + fourier_position_encoding_kwargs=None, +): + """ + Builds the position encoding. + + Args: + + - out_channels: refers to the number of channels of the position encodings. + - project_pos_dim: if specified, will project the position encodings to this dimension. + + """ + + if position_encoding_type == "trainable": + if not trainable_position_encoding_kwargs: + raise ValueError("Make sure to pass trainable_position_encoding_kwargs") + output_pos_enc = PerceiverTrainablePositionEncoding(**trainable_position_encoding_kwargs) + elif position_encoding_type == "fourier": + # We don't use the index_dims argument, as this is only known during the forward pass + if not fourier_position_encoding_kwargs: + raise ValueError("Make sure to pass fourier_position_encoding_kwargs") + output_pos_enc = PerceiverFourierPositionEncoding(**fourier_position_encoding_kwargs) + else: + raise ValueError(f"Unknown position encoding type: {position_encoding_type}.") + + # Optionally, project the position encoding to a target dimension: + positions_projection = nn.Linear(out_channels, project_pos_dim) if project_pos_dim > 0 else nn.Identity() + + return output_pos_enc, positions_projection + + +# Below: Perceiver decoders + + +class PerceiverAbstractDecoder(nn.Module, metaclass=abc.ABCMeta): + """Perceiver abstract decoder.""" + + @abc.abstractmethod + def decoder_query(self, inputs, modality_sizes=None, inputs_without_pos=None, subsampled_points=None): + raise NotImplementedError + + @property + @abc.abstractmethod + def num_query_channels(self): + raise NotImplementedError + + @abc.abstractmethod + def forward(self, query, z, query_mask=None): + raise NotImplementedError + + +class PerceiverProjectionDecoder(PerceiverAbstractDecoder): + """Baseline projection decoder (no cross-attention).""" + + def __init__(self, config): + super().__init__() + self.classifier = nn.Linear(config.d_latents, config.num_labels) + + def decoder_query(self, inputs, modality_sizes=None, inputs_without_pos=None, subsampled_points=None): + return None + + def forward(self, query, z, query_mask=None): + # (batch_size, num_latents, d_latents) -> (batch_size, d_latents) + z = torch.mean(z, dim=1) + # (batch_size, d_latents) -> (batch_size, config.num_labels) + logits = self.classifier(z) + return logits + + +class PerceiverBasicDecoder(PerceiverAbstractDecoder): + """ + Cross-attention-based decoder. + + Here, `output_num_channels` refers to the number of output channels. `num_channels` refers to the number of + channels of the output queries. + + """ + + def __init__( + self, + config, + output_num_channels, + position_encoding_type="trainable", + # The following 2 arguments are ignored if position_encoding_type == 'none': + output_index_dims=None, + num_channels=128, + subsampled_index_dims=None, + qk_channels=None, + v_channels=None, + num_heads=1, + widening_factor=1, + use_query_residual=False, + concat_preprocessed_input=False, + final_project=True, + position_encoding_only=False, + **position_encoding_kwargs, + ): + super().__init__() + + self.output_num_channels = output_num_channels + # If `none`, the decoder will not construct any position encodings. + # You should construct your own when quering the decoder. + self.output_position_encodings = None + self.position_encoding_type = position_encoding_type + self.position_encoding_kwargs = position_encoding_kwargs + if position_encoding_type != "none": + self.output_position_encodings, self.positions_projection = build_position_encoding( + position_encoding_type=position_encoding_type, **position_encoding_kwargs + ) + + self.output_index_dims = output_index_dims + self.num_channels = num_channels + if subsampled_index_dims is None: + subsampled_index_dims = output_index_dims + self.subsampled_index_dims = subsampled_index_dims + self.concat_preprocessed_input = concat_preprocessed_input + self.final_project = final_project + self.position_encoding_only = position_encoding_only + + # for multimodal autoencoding, we don't need the decoder cross-attention and final layer + # so then we will set position_encoding_only to True + if not self.position_encoding_only: + self.decoding_cross_attention = PerceiverLayer( + config, + is_cross_attention=True, + qk_channels=qk_channels, + v_channels=v_channels, + num_heads=num_heads, + q_dim=num_channels, + kv_dim=config.d_latents, + widening_factor=widening_factor, + use_query_residual=use_query_residual, + ) + self.final_layer = nn.Linear(num_channels, output_num_channels) if final_project else nn.Identity() + + @property + def num_query_channels(self) -> int: + if self.position_encoding_type == "none": # Queries come from elsewhere + raise ValueError( + "You cannot calculate number of decoder query channels when position_encoding_type is set to none" + ) + if self.position_encoding_only: + if "project_pos_dim" in self.position_encoding_kwargs: + return self.position_encoding_kwargs["project_pos_dim"] + return self.output_position_encodings.output_size() + if self.final_project: + return self.output_num_channels + return self.num_channels + + def decoder_query(self, inputs, modality_sizes=None, inputs_without_pos=None, subsampled_points=None): + if self.position_encoding_type == "none": # Queries come from elsewhere + raise ValueError("You cannot construct decoder queries when position_encoding_type is set to none") + if subsampled_points is not None: + # subsampled_points are the indices if the inputs would be flattened + # however, the inputs aren't flattened, that's why we use unravel_index + # to get the indices for the unflattened array + # unravel_index returns a tuple (x_idx, y_idx, ...) + # stack to get the [n, d] tensor of coordinates + indices = list(torch.from_numpy(x) for x in np.unravel_index(subsampled_points, self.output_index_dims)) + pos = torch.stack(indices, dim=1) + batch_size = inputs.shape[0] + # Map these coordinates to [-1, 1] + pos = -1 + 2 * pos / torch.tensor(self.output_index_dims)[None, :] + pos = torch.broadcast_to(pos[None], [batch_size, pos.shape[0], pos.shape[1]]) + # Construct the position encoding. + if self.position_encoding_type == "trainable": + pos_emb = self.output_position_encodings(batch_size) + elif self.position_encoding_type == "fourier": + pos_emb = self.output_position_encodings( + self.output_index_dims, batch_size=batch_size, device=inputs.device, pos=pos + ) + + # Optionally project them to a target dimension. + pos_emb = self.positions_projection(pos_emb) + pos_emb = torch.reshape(pos_emb, [pos_emb.shape[0], -1, pos_emb.shape[-1]]) + else: + batch_size = inputs.shape[0] + index_dims = inputs.shape[2:] + + # Construct the position encoding. + if self.position_encoding_type == "trainable": + pos_emb = self.output_position_encodings(batch_size) + elif self.position_encoding_type == "fourier": + pos_emb = self.output_position_encodings(index_dims, batch_size, device=inputs.device) + + # Optionally project them to a target dimension. + pos_emb = self.positions_projection(pos_emb) + + if self.concat_preprocessed_input: + if inputs_without_pos is None: + raise ValueError("Value is required for inputs_without_pos if concat_preprocessed_input is True") + pos_emb = torch.cat([inputs_without_pos, pos_emb], div=-1) + + return pos_emb + + def forward(self, query, z, query_mask=None, output_attentions=False): + # Cross-attention decoding. + # key, value: B x N x K; query: B x M x K + # Attention maps -> B x N x M + # Output -> B x M x K + cross_attentions = () if output_attentions else None + + layer_outputs = self.decoding_cross_attention( + query, + attention_mask=query_mask, + head_mask=None, + inputs=z, + inputs_mask=None, + output_attentions=output_attentions, + ) + output = layer_outputs[0] + + if output_attentions: + cross_attentions = cross_attentions + (layer_outputs[1],) + + logits = self.final_layer(output) + + return PerceiverDecoderOutput(logits=logits, cross_attentions=cross_attentions) + + +class PerceiverClassificationDecoder(PerceiverAbstractDecoder): + """ + Cross-attention based classification decoder. Light-weight wrapper of `BasicDecoder` for logit output. Will turn + the output of the Perceiver encoder which is of shape (batch_size, num_latents, d_latents) to a tensor of shape + (batch_size, num_labels). The queries are of shape (batch_size, 1, num_labels). + """ + + def __init__(self, config, **decoder_kwargs): + super().__init__() + + self.num_labels = config.num_labels + self.decoder = PerceiverBasicDecoder( + config, + output_num_channels=self.num_labels, + output_index_dims=1, # Predict a single logit array. + **decoder_kwargs, + ) + + @property + def num_query_channels(self) -> int: + return self.decoder.num_query_channels + + def decoder_query(self, inputs, modality_sizes=None, inputs_without_pos=None, subsampled_points=None): + return self.decoder.decoder_query( + inputs, modality_sizes, inputs_without_pos, subsampled_points=subsampled_points + ) + + def forward(self, query, z, query_mask=None, output_attentions=False): + decoder_outputs = self.decoder(query, z, output_attentions=output_attentions) + + # B x 1 x num_classes -> B x num_classes + logits = decoder_outputs.logits[:, 0, :] + + return PerceiverDecoderOutput(logits=logits, cross_attentions=decoder_outputs.cross_attentions) + + +class PerceiverOpticalFlowDecoder(PerceiverAbstractDecoder): + """Cross-attention based optical flow decoder.""" + + def __init__(self, config, output_image_shape, output_num_channels=2, rescale_factor=100.0, **decoder_kwargs): + super().__init__() + + self.output_image_shape = output_image_shape + self.output_num_channels = output_num_channels + self.rescale_factor = rescale_factor + self.decoder = PerceiverBasicDecoder(config, output_num_channels=output_num_channels, **decoder_kwargs) + + @property + def num_query_channels(self) -> int: + return self.decoder.num_query_channels + + def decoder_query(self, inputs, modality_sizes=None, inputs_without_pos=None, subsampled_points=None): + if subsampled_points is not None: + raise ValueError("FlowDecoder doesn't support subsampling yet.") + return inputs + + def forward(self, query, z, query_mask=None, output_attentions=False): + decoder_outputs = self.decoder(query, z, output_attentions=output_attentions) + preds = decoder_outputs.logits + # Output flow and rescale. + preds /= self.rescale_factor + preds = preds.reshape([preds.shape[0]] + list(self.output_image_shape) + [preds.shape[-1]]) + return PerceiverDecoderOutput(logits=preds, cross_attentions=decoder_outputs.cross_attentions) + + +class PerceiverBasicVideoAutoencodingDecoder(PerceiverAbstractDecoder): + """ + Cross-attention based video-autoencoding decoder. Light-weight wrapper of `BasicDecoder` with video reshaping + logic. + """ + + def __init__(self, config, output_shape, position_encoding_type, **decoder_kwargs): + super().__init__() + if len(output_shape) != 4: # B, T, H, W + raise ValueError(f"Expected rank 4 output_shape, got {output_shape}.") + # Build the decoder components: + self.output_shape = output_shape + self.output_num_channels = decoder_kwargs["output_num_channels"] + + self.decoder = PerceiverBasicDecoder( + config, + output_index_dims=self.output_shape[1:4], # T*H*W + position_encoding_type=position_encoding_type, + **decoder_kwargs, + ) + + @property + def num_query_channels(self) -> int: + return self.decoder.num_query_channels + + def decoder_query(self, inputs, modality_sizes=None, inputs_without_pos=None, subsampled_points=None): + return self.decoder.decoder_query( + inputs, + modality_sizes=modality_sizes, + inputs_without_pos=inputs_without_pos, + subsampled_points=subsampled_points, + ) + + def forward(self, query, z, query_mask=None): + decoder_outputs = self.decoder(query, z) + logits = decoder_outputs.logits + + logits = torch.reshape(logits, self.output_shape + [logits.shape[-1]]) + return PerceiverDecoderOutput(logits=logits, cross_attentions=decoder_outputs.cross_attentions) + + +def restructure(modality_sizes: ModalitySizeType, inputs: torch.Tensor) -> Mapping[str, torch.Tensor]: + """ + Partitions a [B, N, C] tensor into tensors for each modality. + + Args: + modality_sizes + dict specifying the size of the modality + inputs: + input tensor + + Returns: + dict mapping name of modality to its associated tensor. + """ + outputs = {} + index = 0 + # Apply a predictable ordering to the modalities + for modality in sorted(modality_sizes.keys()): + size = modality_sizes[modality] + inp = inputs[:, index : index + size] + index += size + outputs[modality] = inp + return outputs + + +class PerceiverMultimodalDecoder(PerceiverAbstractDecoder): + """ + Multimodal decoding by composing uni-modal decoders. The modalities argument of the constructor is a dictionary + mapping modality name to the decoder of that modality. That decoder will be used to construct queries for that + modality. However, there is a shared cross attention across all modalities, using the concatenated per-modality + query vectors. + """ + + def __init__( + self, + config, + modalities, + num_outputs, + output_num_channels, + min_padding_size=2, + subsampled_index_dims=None, + **decoder_kwargs + ): + super().__init__() + self.modalities = nn.ModuleDict(modalities) + self.subsampled_index_dims = subsampled_index_dims + self.min_padding_size = min_padding_size + self.output_num_channels = output_num_channels + self.num_outputs = num_outputs + self.decoder = PerceiverBasicDecoder( + config, + output_index_dims=(num_outputs,), + output_num_channels=output_num_channels, + position_encoding_type="none", + num_channels=self.num_query_channels, + **decoder_kwargs, + ) + self.padding = nn.ParameterDict( + { + modality: nn.Parameter(torch.randn(1, self.num_query_channels - decoder.num_query_channels)) + for modality, decoder in modalities.items() + } + ) + + @property + def num_query_channels(self) -> int: + max_channel_size = max(decoder.num_query_channels for _, decoder in self.modalities.items()) + common_channel_size = max_channel_size + self.min_padding_size + return common_channel_size + + def decoder_query(self, inputs, modality_sizes, inputs_without_pos=None, subsampled_points=None): + # Partition the flat inputs among the different modalities + inputs = restructure(modality_sizes, inputs) + + # Obtain modality-specific decoders' queries + subsampled_points = subsampled_points or dict() + + decoder_queries = dict() + for modality, decoder in self.modalities.items(): + # Get input_without_pos for this modality if it exists. + input_without_pos = None + if inputs_without_pos is not None: + input_without_pos = inputs_without_pos.get(modality, None) + query = decoder.decoder_query( + inputs=inputs[modality], + modality_sizes=None, + inputs_without_pos=input_without_pos, + subsampled_points=subsampled_points.get(modality, None), + ) + decoder_queries[modality] = query + + # Pad all queries with trainable position encodings to make them have the same channels + + def embed(modality, x): + x = torch.reshape(x, [x.shape[0], np.prod(x.shape[1:-1]), x.shape[-1]]) + pos = self.padding[modality] + pos = torch.broadcast_to(pos, [x.shape[0], x.shape[1], self.num_query_channels - x.shape[2]]) + return torch.cat([x, pos], dim=2) + + # Apply a predictable ordering to the modalities + return torch.cat( + [embed(modality, decoder_queries[modality]) for modality in sorted(self.modalities.keys())], dim=1 + ) + + def forward(self, query, z, query_mask=None, output_attentions=False): + # B x 1 x num_classes -> B x num_classes + decoder_outputs = self.decoder(query, z, output_attentions=output_attentions) + + return decoder_outputs + + +# Below: IO pre- and post-processor classes for Perceiver. +def space_to_depth(frames: torch.Tensor, temporal_block_size: int = 1, spatial_block_size: int = 1) -> torch.Tensor: + """ + Space to depth transform. Rearranges blocks of spatial data, into depth. + + This function assumes the channels to be first, but will place the channels last after transformation. + + Based on https://discuss.pytorch.org/t/is-there-any-layer-like-tensorflows-space-to-depth-function/3487/15. + """ + if len(frames.shape) == 4: + batch_size, num_channels, height, width = frames.shape + # split up dimensions (height by spatial_block_size, width by spatial_block_size) + frames = frames.view( + batch_size, + num_channels, + height // spatial_block_size, + spatial_block_size, + width // spatial_block_size, + spatial_block_size, + ) + # move blocks to last dimension: (batch_size, H//bs, W//bs, bs, bs, C) + frames = frames.permute(0, 2, 4, 3, 5, 1).contiguous() + # concatenate blocks along channel dimension: (batch_size, H//bs, W//bs, bs*bs*C) + frames = frames.view( + batch_size, + height // spatial_block_size, + width // spatial_block_size, + (spatial_block_size ** 2) * num_channels, + ) + return frames + elif len(frames.shape) == 5: + batch_size, time, num_channels, height, width = frames.shape + # split up dimensions (time by temporal_block_size, height by spatial_block_size, width by spatial_block_size) + frames = frames.view( + batch_size, + time // temporal_block_size, + temporal_block_size, + num_channels, + height // spatial_block_size, + spatial_block_size, + width // spatial_block_size, + spatial_block_size, + ) + # move blocks to last dimension: (batch_size, T//ts, H//bs, W//bs, ts, bs, bs, C) + frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous() + # concatenate blocks along channel dimension: (batch_size, T//ts, H//bs, W//bs, ts*bs*bs*C) + frames = frames.view( + batch_size, + time // temporal_block_size, + height // spatial_block_size, + width // spatial_block_size, + temporal_block_size * (spatial_block_size ** 2) * num_channels, + ) + return frames + else: + raise ValueError( + "Frames should be of rank 4 (batch, channels, height, width)" + " or rank 5 (batch, time, channels, height, width)" + ) + + +class Conv2dSamePadding(nn.Conv2d): + """ + Conv2d layer with padding="same" support. Source: + https://gist.github.com/sumanmichael/4de9dee93f972d47c80c4ade8e149ea6 + """ + + def __init__(self, *args, **kwargs): + super(Conv2dSamePadding, self).__init__(*args, **kwargs) + self.zero_pad_2d = nn.ZeroPad2d( + reduce(__add__, [(k // 2 + (k - 2 * (k // 2)) - 1, k // 2) for k in self.kernel_size[::-1]]) + ) + + def forward(self, input): + return self._conv_forward(self.zero_pad_2d(input), self.weight, self.bias) + + +class Conv2DDownsample(nn.Module): + """Downsamples 4x by applying a 2D convolution and doing max pooling.""" + + def __init__( + self, + num_layers: int = 1, + in_channels: int = 3, + out_channels: int = 64, + use_batchnorm: bool = True, + ): + """ + Constructs a Conv2DDownsample model. + + Args: + in_channels (:obj:`int`, `optional`, defaults to 3): + The number of input channels. + out_channels (:obj:`int`, `optional`, defaults to 64): + The number of conv output channels. + use_batchnorm (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to use batchnorm. + """ + super().__init__() + + self.conv = Conv2dSamePadding( + in_channels=in_channels, out_channels=out_channels, kernel_size=7, stride=2, bias=False + ) + self.batchnorm = nn.BatchNorm2d(num_features=out_channels) if use_batchnorm else nn.Identity() + self.relu = nn.ReLU() + self.max_pool = nn.MaxPool2d(kernel_size=3, stride=2) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + out = self.conv(inputs) + out = self.batchnorm(out) + out = self.relu(out) + out = self.max_pool(out) + return out + + +def generate_fourier_features(pos, num_bands, max_resolution=(224, 224), concat_pos=True, sine_only=False): + """ + Generate a Fourier frequency position encoding with linear spacing. + + Args: + pos (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length, dim)`): + The Tensor containing the position of n points in d dimensional space. + num_bands (:obj:`int`): + The number of frequency bands (K) to use. + max_resolution (:obj:`Tuple[int]`, `optional`, defaults to (224, 224)): + The maximum resolution (i.e. the number of pixels per dim). A tuple representing resolution for each dimension. + concat_pos (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to concatenate the input position encoding to the Fourier features. + sine_only (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to use a single phase (sin) or two (sin/cos) for each frequency band. + + Returns: + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, n_channels)`: The Fourier position + embeddings. If :obj:`concat_pos` is `True` and :obj:`sine_only` is `False`, output dimensions are ordered as: + [dim_1, dim_2, ..., dim_d, sin(pi*f_1*dim_1), ..., sin(pi*f_K*dim_1), ..., sin(pi*f_1*dim_d), ..., + sin(pi*f_K*dim_d), cos(pi*f_1*dim_1), ..., cos(pi*f_K*dim_1), ..., cos(pi*f_1*dim_d), ..., cos(pi*f_K*dim_d)], + where dim_i is pos[:, i] and f_k is the kth frequency band. + """ + + batch_size = pos.shape[0] + + min_freq = 1.0 + # Nyquist frequency at the target resolution: + freq_bands = torch.stack( + [torch.linspace(start=min_freq, end=res / 2, steps=num_bands) for res in max_resolution], dim=0 + ) + + # Get frequency bands for each spatial dimension. + # Output is size [n, d * num_bands] + per_pos_features = pos[0, :, :][:, :, None] * freq_bands[None, :, :] + per_pos_features = torch.reshape(per_pos_features, [-1, np.prod(per_pos_features.shape[1:])]) + + if sine_only: + # Output is size [n, d * num_bands] + per_pos_features = torch.sin(np.pi * (per_pos_features)) + else: + # Output is size [n, 2 * d * num_bands] + per_pos_features = torch.cat( + [torch.sin(np.pi * per_pos_features), torch.cos(np.pi * per_pos_features)], dim=-1 + ) + # Concatenate the raw input positions. + if concat_pos: + # Adds d bands to the encoding. + per_pos_features = torch.cat([pos, per_pos_features.expand(batch_size, -1, -1)], dim=-1) + return per_pos_features + + +def build_linear_positions(index_dims, output_range=(-1.0, 1.0)): + """ + Generate an array of position indices for an N-D input array. + + Args: + index_dims (:obj:`List[int]`): + The shape of the index dimensions of the input array. + output_range (:obj:`Tuple[float]`, `optional`, defaults to :obj:`(-1.0, 1.0)`): + The min and max values taken by each input index dimension. + + Returns: + :obj:`torch.FloatTensor` of shape :obj:`(index_dims[0], index_dims[1], .., index_dims[-1], N)`. + """ + + def _linspace(n_xels_per_dim): + return torch.linspace(start=output_range[0], end=output_range[1], steps=n_xels_per_dim, dtype=torch.float32) + + dim_ranges = [_linspace(n_xels_per_dim) for n_xels_per_dim in index_dims] + array_index_grid = torch.meshgrid(*dim_ranges) + + return torch.stack(array_index_grid, dim=-1) + + +class PerceiverAbstractPositionEncoding(nn.Module, metaclass=abc.ABCMeta): + """Perceiver abstract position encoding.""" + + @property + @abc.abstractmethod + def num_dimensions(self) -> int: + raise NotImplementedError + + @abc.abstractmethod + def output_size(self, *args, **kwargs) -> int: + raise NotImplementedError + + @abc.abstractmethod + def forward(self, batch_size, pos): + raise NotImplementedError + + +class PerceiverTrainablePositionEncoding(PerceiverAbstractPositionEncoding): + """Trainable position encoding.""" + + def __init__(self, index_dims, num_channels=128): + super().__init__() + self._num_channels = num_channels + self._index_dims = index_dims + index_dim = np.prod(index_dims) + self.position_embeddings = nn.Parameter(torch.randn(index_dim, num_channels)) + + @property + def num_dimensions(self) -> int: + if isinstance(self._index_dims, int): + return 1 + return len(self._index_dims) + + def output_size(self, *args, **kwargs) -> int: + return self._num_channels + + def forward(self, batch_size): + position_embeddings = self.position_embeddings + + if batch_size is not None: + position_embeddings = position_embeddings.expand(batch_size, -1, -1) + return position_embeddings + + +def _check_or_build_spatial_positions(pos, index_dims, batch_size): + """ + Checks or builds spatial position features (x, y, ...). + + Args: + pos (:obj:`torch.FloatTensor`): + None, or an array of position features. If None, position features are built. Otherwise, their size is checked. + index_dims (:obj:`List[int]`): + An iterable giving the spatial/index size of the data to be featurized. + batch_size (:obj:`int`): + The batch size of the data to be featurized. + + Returns: + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, prod(index_dims))` an array of position features. + """ + if pos is None: + pos = build_linear_positions(index_dims) + pos = torch.broadcast_to(pos[None], (batch_size,) + pos.shape) + pos = torch.reshape(pos, [batch_size, np.prod(index_dims), -1]) + else: + # Just a warning label: you probably don't want your spatial features to + # have a different spatial layout than your pos coordinate system. + # But feel free to override if you think it'll work! + if pos.shape[-1] != len(index_dims): + raise ValueError("Spatial features have the wrong number of dimensions.") + return pos + + +class PerceiverFourierPositionEncoding(PerceiverAbstractPositionEncoding): + """Fourier (Sinusoidal) position encoding.""" + + def __init__(self, num_bands, max_resolution, concat_pos=True, sine_only=False): + super().__init__() + self.num_bands = num_bands + self.max_resolution = max_resolution + self.concat_pos = concat_pos + self.sine_only = sine_only + + @property + def num_dimensions(self) -> int: + return len(self.max_resolution) + + def output_size(self): + """Returns size of positional encodings last dimension.""" + num_dims = len(self.max_resolution) + encoding_size = self.num_bands * num_dims + if not self.sine_only: + encoding_size *= 2 + if self.concat_pos: + encoding_size += self.num_dimensions + + return encoding_size + + def forward(self, index_dims, batch_size, device, pos=None): + pos = _check_or_build_spatial_positions(pos, index_dims, batch_size) + fourier_pos_enc = generate_fourier_features( + pos, + num_bands=self.num_bands, + max_resolution=self.max_resolution, + concat_pos=self.concat_pos, + sine_only=self.sine_only, + ).to(device) + return fourier_pos_enc + + +class AbstractPreprocessor(nn.Module): + @property + def num_channels(self) -> int: + """Returns size of preprocessor output.""" + raise NotImplementedError() + + +class PerceiverTextPreprocessor(AbstractPreprocessor): + """Text preprocessing for Perceiver Encoder.""" + + def __init__(self, config): + super().__init__() + self.embeddings = nn.Embedding(num_embeddings=config.vocab_size, embedding_dim=config.d_model) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.d_model) + + @property + def num_channels(self) -> int: + return self.config.d_model + + def forward(self, inputs): + embeddings = self.embeddings(inputs) + + seq_length = inputs.shape[1] + position_ids = torch.arange(0, seq_length, device=inputs.device) + embeddings = embeddings + self.position_embeddings(position_ids) + + return embeddings, None, None + + +class PerceiverEmbeddingDecoder(nn.Module): + """Module to decode embeddings (for masked language modeling).""" + + def __init__(self, config): + """Constructs the module.""" + super().__init__() + self.config = config + self.vocab_size = config.vocab_size + self.bias = nn.Parameter(torch.zeros(self.vocab_size)) + + def forward(self, hidden_states, embedding_layer): + batch_size, seq_len, d_model = hidden_states.shape + output = torch.matmul(hidden_states.reshape([-1, d_model]), embedding_layer.weight.T) # Flatten batch dim + output = output + self.bias + + return output.reshape([batch_size, seq_len, self.vocab_size]) + + +class PerceiverMultimodalPostprocessor(nn.Module): + """ + Multimodal postprocessing for Perceiver. + + Args: + modalities (:obj:`Dict[str, PostprocessorType]`): + Dictionary mapping modality name to postprocessor class for that modality. + input_is_dict (:obj:`bool`, `optional`, defaults to :obj:`False`): + If True, input is assumed to be dictionary structured, and outputs keep the same dictionary shape. If + False, input is a tensor which is sliced up during postprocessing by `modality_sizes`. + """ + + def __init__(self, modalities: Mapping[str, PostprocessorType], input_is_dict: bool = False): + super().__init__() + self.modalities = nn.ModuleDict(modalities) + self.input_is_dict = input_is_dict + + def forward( + self, inputs: torch.Tensor, pos: Optional[torch.Tensor] = None, modality_sizes=None + ) -> Mapping[str, torch.Tensor]: + if not self.input_is_dict: + # Slice up modalities by their sizes. + if modality_sizes is None: + raise ValueError("Modality sizes should be specified if input is not a dictionary.") + inputs = restructure(modality_sizes=modality_sizes, inputs=inputs) + + outputs = { + modality: postprocessor(inputs[modality], pos=pos, modality_sizes=None) + for modality, postprocessor in self.modalities.items() + } + return outputs + + +class PerceiverClassificationPostprocessor(nn.Module): + """ + Classification postprocessing for Perceiver. Can be used to convert the decoder output to classification logits. + + Args: + config (:obj:`PerceiverConfig`): + Model configuration. + in_channels (:obj:`int`): + Number of channels in the input. + """ + + def __init__(self, config, in_channels): + super().__init__() + self.classifier = nn.Linear(in_channels, config.num_labels) + + def forward(self, inputs, pos: Optional[torch.Tensor] = None, modality_sizes=None) -> torch.Tensor: + logits = self.classifier(inputs) + return logits[:, 0, :] + + +class PerceiverAudioPostprocessor(nn.Module): + """ + Audio postprocessing for Perceiver. Can be used to convert the decoder output to audio features. + + Args: + config (:obj:`PerceiverConfig`): + Model configuration. + in_channels (:obj:`int`): + Number of channels in the input. + postproc_type (:obj:`str`, `optional`, defaults to :obj:`"patches"`): + Postprocessor type to use. Currently, only "patches" is supported. + """ + + def __init__(self, config, in_channels, postproc_type: str = "patches"): + super().__init__() + + if postproc_type not in ("patches",): # to be supported: 'conv', 'patches', 'pixels' + raise ValueError("Invalid postproc_type!") + + # Architecture parameters: + self.classifier = nn.Linear(in_channels, config.samples_per_patch) + + def forward(self, inputs: torch.Tensor, pos: Optional[torch.Tensor] = None, modality_sizes=None) -> torch.Tensor: + + logits = self.classifier(inputs) + return torch.reshape(logits, [inputs.shape[0], -1]) + + +class PerceiverProjectionPostprocessor(nn.Module): + """ + Projection postprocessing for Perceiver. Can be used to convert the project the channels of the decoder output to a + lower dimension. + + Args: + in_channels (:obj:`int`): + Number of channels in the input. + out_channels (:obj:`int`): + Number of channels in the output. + """ + + def __init__(self, in_channels, out_channels): + super().__init__() + self.classifier = nn.Linear(in_channels, out_channels) + + def forward(self, inputs: torch.Tensor, pos: Optional[torch.Tensor] = None, modality_sizes=None) -> torch.Tensor: + logits = self.classifier(inputs) + return logits + + +class PerceiverImagePreprocessor(AbstractPreprocessor): + """ + Image preprocessing for Perceiver Encoder. + + Note: the `out_channels` argument refers to the output channels of a convolutional layer, if `prep_type` is set to + "conv1x1" or "conv". If one adds absolute position embeddings, one must make sure the `num_channels` of the + position encoding kwargs are set equal to the `out_channels`. + + Args: + config (:obj:`PerceiverConfig`): + Model configuration. + prep_type (:obj:`str`, `optional`, defaults to :obj:`"conv"`): + Preprocessing type. Can be "conv1x1", "conv", "patches", "pixels". + spatial_downsample (:obj:`int`, `optional`, defaults to 4): + Spatial downsampling factor. + temporal_downsample (:obj:`int`, `optional`, defaults to 1): + Temporal downsampling factor (only relevant in case a time dimension is present). + position_encoding_type (:obj:`str`, `optional`, defaults to :obj:`"fourier"`): + Position encoding type. Can be "fourier" or "trainable". + in_channels (:obj:`int`, `optional`, defaults to 3): + Number of channels in the input. + out_channels (:obj:`int`, `optional`, defaults to 64): + Number of channels in the output. + conv_after_patching (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to apply a convolutional layer after patching. + conv_after_patching_in_channels (:obj:`int`, `optional`, defaults to 54): + Number of channels in the input of the convolutional layer after patching. + conv2d_use_batchnorm (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to use batch normalization in the convolutional layer. + concat_or_add_pos (:obj:`str`, `optional`, defaults to :obj:`"concat"`): + How to concatenate the position encoding to the input. Can be "concat" or "add". + project_pos_dim (:obj:`int`, `optional`, defaults to -1): + Dimension of the position encoding to project to. If -1, no projection is applied. + **position_encoding_kwargs (:obj:`Dict`, `optional`): + Keyword arguments for the position encoding. + """ + + def __init__( + self, + config, + prep_type="conv", + spatial_downsample: int = 4, + temporal_downsample: int = 1, + position_encoding_type: str = "fourier", + in_channels: int = 3, + out_channels: int = 64, + conv_after_patching: bool = False, + conv_after_patching_in_channels: int = 54, # only relevant when conv_after_patching = True + conv2d_use_batchnorm: bool = True, + concat_or_add_pos: str = "concat", + project_pos_dim: int = -1, + **position_encoding_kwargs, + ): + super().__init__() + self.config = config + + if prep_type not in ("conv", "patches", "pixels", "conv1x1"): + raise ValueError(f"Prep_type {prep_type} is invalid") + + if concat_or_add_pos not in ["concat", "add"]: + raise ValueError(f"Invalid value {concat_or_add_pos} for concat_or_add_pos.") + + self.in_channels = in_channels + self.prep_type = prep_type + self.spatial_downsample = spatial_downsample + self.temporal_downsample = temporal_downsample + self.position_encoding_type = position_encoding_type + self.concat_or_add_pos = concat_or_add_pos + self.conv_after_patching = conv_after_patching + self.out_channels = out_channels + + if self.prep_type == "conv": + # Downsampling with conv is currently restricted + convnet_num_layers = math.log(spatial_downsample, 4) + convnet_num_layers_is_int = convnet_num_layers == np.round(convnet_num_layers) + if not convnet_num_layers_is_int or temporal_downsample != 1: + raise ValueError( + "Only powers of 4 expected for spatial and 1 expected for temporal downsampling with conv." + ) + self.convnet = Conv2DDownsample( + in_channels=in_channels, + num_layers=int(convnet_num_layers), + out_channels=out_channels, + use_batchnorm=conv2d_use_batchnorm, + ) + + elif self.prep_type == "conv1x1": + if temporal_downsample != 1: + raise ValueError("Conv1x1 does not downsample in time.") + self.convnet_1x1 = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(1, 1), + # spatial_downsample is unconstrained for 1x1 convolutions. + stride=(spatial_downsample, spatial_downsample), + ) + + # Position embeddings + self.project_pos_dim = project_pos_dim + self.position_embeddings, self.positions_projection = build_position_encoding( + position_encoding_type=position_encoding_type, + out_channels=out_channels, + project_pos_dim=project_pos_dim, + **position_encoding_kwargs, + ) + + # Optional convolutional layer after patches. + self.conv_after_patches = ( + nn.Linear(conv_after_patching_in_channels, self.out_channels) if conv_after_patching else nn.Identity() + ) + + @property + def num_channels(self) -> int: + # Let's assume that the number of resolutions (in the context of image preprocessing) + # of the input data is 2 or 3 depending on whether we are processing image or video respectively. + # In this case, for convenience, we will declare is_temporal variable, + # which will show whether the data has a temporal dimension or not. + is_temporal = self.position_embeddings.num_dimensions > 2 + + # position embedding + if self.project_pos_dim > 0: + pos_dim = self.project_pos_dim + else: + pos_dim = self.position_embeddings.output_size() + if self.concat_or_add_pos == "add": + return pos_dim + + # inputs + if self.conv_after_patching or self.prep_type in ("conv1x1", "conv"): + inp_dim = self.out_channels + elif self.prep_type == "pixels": + inp_dim = self.in_channels + if not is_temporal: + inp_dim = math.ceil(inp_dim / self.spatial_downsample) + elif self.prep_type == "patches": + if self.conv_after_patching: + inp_dim = self.out_channels + else: + inp_dim = self.in_channels * self.spatial_downsample ** 2 + if is_temporal: + inp_dim *= self.temporal_downsample + + return inp_dim + pos_dim + + def _build_network_inputs(self, inputs: torch.Tensor, pos: torch.Tensor, network_input_is_1d: bool = True): + """ + Construct the final input, including position encoding. + + This method expects the inputs to always have channels as last dimension. + + """ + batch_size = inputs.shape[0] + index_dims = inputs.shape[1:-1] + indices = np.prod(index_dims) + + # Flatten input features to a 1D index dimension if necessary. + if len(inputs.shape) > 3 and network_input_is_1d: + inputs = torch.reshape(inputs, [batch_size, indices, -1]) + + # Construct the position encoding. + if self.position_encoding_type == "trainable": + pos_enc = self.position_embeddings(batch_size) + elif self.position_encoding_type == "fourier": + pos_enc = self.position_embeddings(index_dims, batch_size, device=inputs.device) + + # Optionally project them to a target dimension. + pos_enc = self.positions_projection(pos_enc) + + if not network_input_is_1d: + # Reshape pos to match the input feature shape + # if the network takes non-1D inputs + sh = inputs.shape + pos_enc = torch.reshape(pos_enc, list(sh)[:-1] + [-1]) + if self.concat_or_add_pos == "concat": + inputs_with_pos = torch.cat([inputs, pos_enc], dim=-1) + elif self.concat_or_add_pos == "add": + inputs_with_pos = inputs + pos_enc + return inputs_with_pos, inputs + + def forward(self, inputs: torch.Tensor, pos: Optional[torch.Tensor] = None, network_input_is_1d: bool = True): + if self.prep_type == "conv": + # Convnet image featurization. + # Downsamples spatially by a factor of 4 + inputs = self.convnet(inputs) + + elif self.prep_type == "conv1x1": + # map inputs to self.out_channels + inputs = self.convnet_1x1(inputs) + + elif self.prep_type == "pixels": + # if requested, downsamples in the crudest way + if inputs.ndim == 4: + inputs = inputs[:: self.spatial_downsample, :: self.spatial_downsample] + elif inputs.ndim == 5: + inputs = inputs[ + :, :: self.temporal_downsample, :, :: self.spatial_downsample, :: self.spatial_downsample + ] + else: + raise ValueError("Unsupported data format for pixels.") + + elif self.prep_type == "patches": + # Space2depth featurization. + # Video: B x T x C x H x W + inputs = space_to_depth( + inputs, temporal_block_size=self.temporal_downsample, spatial_block_size=self.spatial_downsample + ) + + if inputs.ndim == 5 and inputs.shape[1] == 1: + # for flow + inputs = inputs.squeeze(dim=1) + + # Optionally apply conv layer. + inputs = self.conv_after_patches(inputs) + + if self.prep_type != "patches": + # move channels to last dimension, as the _build_network_inputs method below expects this + if inputs.ndim == 4: + inputs = torch.moveaxis(inputs, 1, -1) + elif inputs.ndim == 5: + inputs = torch.moveaxis(inputs, 2, -1) + else: + raise ValueError("Unsupported data format for conv1x1.") + + inputs, inputs_without_pos = self._build_network_inputs(inputs, pos, network_input_is_1d) + modality_sizes = None # Size for each modality, only needed for multimodal + + return inputs, modality_sizes, inputs_without_pos + + +class PerceiverOneHotPreprocessor(AbstractPreprocessor): + """ + One-hot preprocessor for Perceiver Encoder. Can be used to add a dummy index dimension to the input. + + Args: + config (:obj:`PerceiverConfig`): + Model configuration. + """ + + def __init__(self, config): + super().__init__() + self.config: PerceiverConfig = config + + @property + def num_channels(self) -> int: + return self.config.num_labels + + def forward(self, inputs: torch.Tensor, pos: Optional[torch.Tensor] = None, network_input_is_1d: bool = True): + # Add a dummy index dimension. + inputs = inputs[:, None, :] + + # No position encodings, so the 1st (input) and 3rd (inputs_without_pos) + # outputs are identical. + return inputs, None, inputs + + +class PerceiverAudioPreprocessor(AbstractPreprocessor): + """ + Audio preprocessing for Perceiver Encoder. + + Args: + config (:obj:`PerceiverConfig`): + Model configuration. + prep_type (:obj:`str`, `optional`, defaults to :obj:`"patches"`): + Preprocessor type to use. Only "patches" is supported. + samples_per_patch (:obj:`int`, `optional`, defaults to 96): + Number of samples per patch. + position_encoding_type (:obj:`str`, `optional`, defaults to :obj:`"fourier"`): + Type of position encoding to use. Can be "trainable" or "fourier". + concat_or_add_pos (:obj:`str`, `optional`, defaults to :obj:`"concat"`): + How to concatenate the position encoding to the input. Can be "concat" or "add". + out_channels (:obj:`int`, `optional`, defaults to 64): + Number of channels in the output. + project_pos_dim (:obj:`int`, `optional`, defaults to -1): + Dimension of the position encoding to project to. If -1, no projection is applied. + **position_encoding_kwargs (:obj:`Dict`, `optional`): + Keyword arguments for the position encoding. + """ + + def __init__( + self, + config, + prep_type: str = "patches", + samples_per_patch: int = 96, + position_encoding_type: str = "fourier", + concat_or_add_pos: str = "concat", + out_channels=64, + project_pos_dim=-1, + **position_encoding_kwargs, + ): + super().__init__() + self.config = config + + if prep_type not in ("patches",): + raise ValueError(f"Prep_type {prep_type} is invalid, can only be 'patches'.") + + if concat_or_add_pos not in ["concat", "add"]: + raise ValueError(f"Concat_or_pos {concat_or_add_pos} is invalid, can only be 'concat' or 'add'.") + + self.samples_per_patch = samples_per_patch + self.position_encoding_type = position_encoding_type + self.concat_or_add_pos = concat_or_add_pos + self.project_pos_dim = project_pos_dim + + # Position embeddings + self.position_embeddings, self.positions_projection = build_position_encoding( + position_encoding_type=position_encoding_type, + out_channels=out_channels, + project_pos_dim=project_pos_dim, + **position_encoding_kwargs, + ) + + @property + def num_channels(self) -> int: + # position embedding + if self.project_pos_dim > 0: + pos_dim = self.project_pos_dim + else: + pos_dim = self.position_embeddings.output_size() + if self.concat_or_add_pos == "add": + return pos_dim + return self.samples_per_patch + pos_dim + + def _build_network_inputs(self, inputs, pos): + """Construct the final input, including position encoding.""" + batch_size = inputs.shape[0] + index_dims = inputs.shape[1:-1] + + # Construct the position encoding. + if self.position_encoding_type == "trainable": + pos_enc = self.position_embeddings(batch_size) + elif self.position_encoding_type == "fourier": + pos_enc = self.position_embeddings(index_dims, batch_size, device=inputs.device) + + # Optionally project them to a target dimension. + pos_enc = self.positions_projection(pos_enc) + + if self.concat_or_add_pos == "concat": + inputs_with_pos = torch.cat([inputs, pos_enc], dim=-1) + elif self.concat_or_add_pos == "add": + inputs_with_pos = inputs + pos_enc + + return inputs_with_pos, inputs + + def forward(self, inputs, pos, network_input_is_1d: bool = True): + inputs = torch.reshape(inputs, [inputs.shape[0], -1, self.samples_per_patch]) + + inputs, inputs_without_pos = self._build_network_inputs(inputs, pos) + modality_sizes = None # Size for each modality, only needed for multimodal + + return inputs, modality_sizes, inputs_without_pos + + +class PerceiverMultimodalPreprocessor(AbstractPreprocessor): + """ + Multimodal preprocessing for Perceiver Encoder. + + Inputs for each modality are preprocessed, then padded with trainable position embeddings to have the same number + of channels. + + Args: + modalities (:obj:`Dict[str, PreprocessorType]`): + Dict mapping modality name to preprocessor. + mask_probs (:obj:`Dict[str, float]`): + Dict mapping modality name to masking probability of that modality. + min_padding_size (:obj:`int`, `optional`, defaults to 2): + The minimum padding size for all modalities. The final output will have num_channels equal to the maximum + channels across all modalities plus min_padding_size. + """ + + def __init__( + self, + modalities: Mapping[str, PreprocessorType], + mask_probs: Optional[Mapping[str, float]] = None, + min_padding_size: int = 2, + ): + super().__init__() + self.modalities = modalities + self.min_padding_size = min_padding_size + self.mask_probs = mask_probs if mask_probs is not None else dict() + self.padding = nn.ParameterDict( + { + modality: nn.Parameter(torch.randn(1, self.num_channels - preprocessor.num_channels)) + for modality, preprocessor in modalities.items() + } + ) + self.mask = nn.ParameterDict( + {modality: nn.Parameter(torch.randn(1, self.num_channels)) for modality, _ in self.mask_probs.items()} + ) + + @property + def num_channels(self) -> int: + max_channel_size = max(processor.num_channels for _, processor in self.modalities.items()) + common_channel_size = max_channel_size + self.min_padding_size + return common_channel_size + + def forward( + self, inputs: Mapping[str, torch.Tensor], pos: Optional[torch.Tensor] = None, network_input_is_1d: bool = True + ) -> PreprocessorOutputType: + padded = {} + modality_sizes = {} + inputs_without_pos = {} + for modality, preprocessor in self.modalities.items(): + # preprocess each modality using the respective preprocessor. + output, _, inputs_without_pos[modality] = preprocessor( + inputs[modality], pos=pos, network_input_is_1d=network_input_is_1d + ) + + # pad to the same common_channel_size. + batch_size, num_samples, num_channels = output.shape + pos_enc = self.padding[modality].expand(batch_size, -1, -1) + + padding = torch.broadcast_to( + pos_enc, + [batch_size, num_samples, self.num_channels - num_channels], + ) + output_padded = torch.cat([output, padding], dim=2) + + # mask if required + if modality in self.mask_probs: + mask_token = self.mask[modality].expand(batch_size, -1, -1) + mask_prob = self.mask_probs[modality] + mask = torch.bernoulli(torch.full([batch_size, num_samples], mask_prob)) + mask = torch.unsqueeze(mask, dim=2).to(mask_token.device) + output_padded = (1 - mask) * output_padded + mask * mask_token + + padded[modality] = output_padded + modality_sizes[modality] = output_padded.shape[1] + + # Apply a predictable ordering to the modalities + padded_ls = [padded[k] for k in sorted(padded.keys())] + + # Finally, concatenate along the time dimension + final_inputs = torch.cat(padded_ls, dim=1) + + return final_inputs, modality_sizes, inputs_without_pos diff --git a/src/transformers/models/perceiver/tokenization_perceiver.py b/src/transformers/models/perceiver/tokenization_perceiver.py new file mode 100644 index 0000000000..9349c5aac4 --- /dev/null +++ b/src/transformers/models/perceiver/tokenization_perceiver.py @@ -0,0 +1,204 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization class for Perceiver.""" + + +from typing import Dict, List, Optional, Tuple + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class PerceiverTokenizer(PreTrainedTokenizer): + """ + Construct a Perceiver tokenizer. The Perceiver simply uses raw bytes utf-8 encoding. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + bos_token (:obj:`str`, `optional`, defaults to :obj:`"[BOS]"`): + The BOS token (reserved in the vocab, but not actually used). + eos_token (:obj:`str`, `optional`, defaults to :obj:`"[EOS]"`): + The end of sequence token (reserved in the vocab, but not actually used). + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the :obj:`sep_token`. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The MASK token, useful for masked language modeling. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The CLS token (reserved in the vocab, but not actually used). + sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from two sequences. + + """ + + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + pad_token="[PAD]", + bos_token="[BOS]", + eos_token="[EOS]", + mask_token="[MASK]", + cls_token="[CLS]", + sep_token="[SEP]", + model_max_length=2048, + **kwargs + ) -> None: + + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + + super().__init__( + pad_token=pad_token, + bos_token=bos_token, + eos_token=eos_token, + mask_token=mask_token, + cls_token=cls_token, + sep_token=sep_token, + model_max_length=model_max_length, + **kwargs, + ) + + self._utf_vocab_size = 2 ** 8 # utf is 8 bits + + # define special tokens dict + self.special_tokens_encoder: Dict[int, str] = { + self.pad_token: 0, + self.bos_token: 1, + self.eos_token: 2, + self.mask_token: 3, + self.cls_token: 4, + self.sep_token: 5, + } + self._num_special_tokens = len(self.special_tokens_encoder) + self.special_tokens_decoder: Dict[str, int] = {v: k for k, v in self.special_tokens_encoder.items()} + + @property + def vocab_size(self): + return self._utf_vocab_size + self._num_special_tokens + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + # normal case: some special tokens + if token_ids_1 is None: + return [1] + [0] * len(token_ids_0) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks. A sequence has the + following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + else: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + token_ids_1 + [self.sep_token_id] + + def _tokenize(self, text: str) -> List[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + tokens = [chr(i) for i in text.encode("utf-8")] + return tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.special_tokens_encoder: + token_id = self.special_tokens_encoder[token] + elif token in self.added_tokens_encoder: + token_id = self.added_tokens_encoder[token] + elif len(token) != 1: + token_id = self.unk_token_id + else: + token_id = ord(token) + self._num_special_tokens + return token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.special_tokens_decoder: + token = self.special_tokens_decoder[index] + elif index in self.added_tokens_decoder: + token = self.added_tokens_decoder[index] + else: + token = chr(index - self._num_special_tokens) + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + bstring = b"" + for token in tokens: + if token in self.special_tokens_decoder: + tok_string = self.special_tokens_decoder[token].encode("utf-8") + elif token in self.added_tokens_decoder: + tok_string = self.special_tokens_decoder[token].encode("utf-8") + elif token in self.special_tokens_encoder: + tok_string = token.encode("utf-8") + elif token in self.added_tokens_encoder: + tok_string = token.encode("utf-8") + else: + tok_string = bytes([ord(token)]) + bstring += tok_string + string = bstring.decode("utf-8", errors="replace") + return string + + # PerceiverTokenizer has no vocab file + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + return () diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 3ab5f6a416..efd82bf0f0 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3753,6 +3753,87 @@ class PegasusPreTrainedModel: requires_backends(self, ["torch"]) +PERCEIVER_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class PerceiverForImageClassificationConvProcessing: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PerceiverForImageClassificationFourier: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PerceiverForImageClassificationLearned: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PerceiverForMaskedLM: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + def forward(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PerceiverForMultimodalAutoencoding: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PerceiverForOpticalFlow: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PerceiverForSequenceClassification: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + def forward(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PerceiverLayer: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PerceiverModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + def forward(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PerceiverPreTrainedModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + def forward(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index a9f2c9e949..5347757624 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -64,6 +64,11 @@ class LayoutXLMProcessor: requires_backends(cls, ["vision"]) +class PerceiverFeatureExtractor: + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class SegformerFeatureExtractor: def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) diff --git a/tests/test_modeling_perceiver.py b/tests/test_modeling_perceiver.py new file mode 100644 index 0000000000..c946d49f0b --- /dev/null +++ b/tests/test_modeling_perceiver.py @@ -0,0 +1,989 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Perceiver model. """ + +import copy +import inspect +import math +import tempfile +import unittest +import warnings +from typing import Dict, List, Tuple + +import numpy as np +from datasets import load_dataset + +from transformers import PerceiverConfig +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + MODEL_FOR_MASKED_LM_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_MAPPING, + PerceiverForImageClassificationConvProcessing, + PerceiverForImageClassificationFourier, + PerceiverForImageClassificationLearned, + PerceiverForMaskedLM, + PerceiverForMultimodalAutoencoding, + PerceiverForOpticalFlow, + PerceiverForSequenceClassification, + PerceiverModel, + PerceiverTokenizer, + ) + from transformers.models.perceiver.modeling_perceiver import PERCEIVER_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import PerceiverFeatureExtractor + + +class PerceiverModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + num_channels=3, + image_size=32, + train_size=[20, 20], + num_frames=5, + audio_samples_per_frame=200, + samples_per_patch=20, + nchunks=20, + num_latents=10, + d_latents=20, + num_blocks=1, + num_self_attends_per_block=2, + num_self_attention_heads=1, + num_cross_attention_heads=1, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_act="gelu", + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + max_position_embeddings=7, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.num_channels = num_channels + self.image_size = image_size + self.train_size = train_size + self.num_frames = num_frames + self.audio_samples_per_frame = audio_samples_per_frame + self.samples_per_patch = samples_per_patch + self.nchunks = nchunks + self.num_latents = num_latents + self.d_latents = d_latents + self.num_blocks = num_blocks + self.num_self_attends_per_block = num_self_attends_per_block + self.num_self_attention_heads = num_self_attention_heads + self.num_cross_attention_heads = num_cross_attention_heads + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_act = hidden_act + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + # set subsampling for multimodal model (take first chunk) + image_chunk_size = np.prod((self.num_frames, self.image_size, self.image_size)) // self.nchunks + audio_chunk_size = self.num_frames * self.audio_samples_per_frame // self.samples_per_patch // self.nchunks + self.subsampling = { + "image": torch.arange(0, image_chunk_size), + "audio": torch.arange(0, audio_chunk_size), + "label": None, + } + + def prepare_config_and_inputs(self, model_class=None): + config = self.get_config() + + input_mask = None + sequence_labels = None + token_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.num_labels) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + + if model_class is None or model_class.__name__ == "PerceiverModel": + inputs = floats_tensor([self.batch_size, self.seq_length, config.d_model], self.vocab_size) + return config, inputs, input_mask, sequence_labels, token_labels + elif model_class.__name__ in ["PerceiverForMaskedLM", "PerceiverForSequenceClassification"]: + inputs = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + # input mask is only relevant for text inputs + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + elif model_class.__name__ == "PerceiverForImageClassificationLearned": + config.d_model = 512 + inputs = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + elif model_class.__name__ == "PerceiverForImageClassificationFourier": + config.d_model = 261 + inputs = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + elif model_class.__name__ == "PerceiverForImageClassificationConvProcessing": + config.d_model = 322 + inputs = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + elif model_class.__name__ == "PerceiverForOpticalFlow": + config.d_model = 322 + inputs = floats_tensor([self.batch_size, 2, 27, self.train_size[0], self.train_size[1]]) + elif model_class.__name__ == "PerceiverForMultimodalAutoencoding": + config.d_model = 409 + images = torch.randn( + (self.batch_size, self.num_frames, self.num_channels, self.image_size, self.image_size), + device=torch_device, + ) + audio = torch.randn( + (self.batch_size, self.num_frames * self.audio_samples_per_frame, 1), device=torch_device + ) + inputs = dict( + image=images, audio=audio, label=torch.zeros((self.batch_size, self.num_labels), device=torch_device) + ) + else: + raise ValueError(f"Model class {model_class} not supported") + + return config, inputs, input_mask, sequence_labels, token_labels + + def get_config(self): + return PerceiverConfig( + num_latents=self.num_latents, + d_latents=self.d_latents, + num_blocks=self.num_blocks, + num_self_attends_per_block=self.num_self_attends_per_block, + num_self_attention_heads=self.num_self_attention_heads, + num_cross_attention_heads=self.num_cross_attention_heads, + vocab_size=self.vocab_size, + hidden_act=self.hidden_act, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + initializer_range=self.initializer_range, + max_position_embeddings=self.max_position_embeddings, + image_size=self.image_size, + train_size=self.train_size, + num_frames=self.num_frames, + audio_samples_per_frame=self.audio_samples_per_frame, + samples_per_patch=self.samples_per_patch, + num_labels=self.num_labels, + ) + + def create_and_check_for_masked_lm(self, config, inputs, input_mask, sequence_labels, token_labels): + model = PerceiverForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(inputs, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification(self, config, inputs, input_mask, sequence_labels, token_labels): + # set num_labels + config.num_labels = self.num_labels + model = PerceiverForSequenceClassification(config=config) + model.to(torch_device) + model.eval() + result = model(inputs, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_image_classification_learned( + self, config, inputs, input_mask, sequence_labels, token_labels + ): + # set d_model and num_labels + config.d_model = 512 + config.num_labels = self.num_labels + model = PerceiverForImageClassificationLearned(config=config) + model.to(torch_device) + model.eval() + result = model(inputs, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_image_classification_fourier( + self, config, inputs, input_mask, sequence_labels, token_labels + ): + # set d_model and num_labels + config.d_model = 261 + config.num_labels = self.num_labels + model = PerceiverForImageClassificationFourier(config=config) + model.to(torch_device) + model.eval() + result = model(inputs, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_image_classification_conv( + self, config, inputs, input_mask, sequence_labels, token_labels + ): + # set d_model and num_labels + config.d_model = 322 + config.num_labels = self.num_labels + model = PerceiverForImageClassificationConvProcessing(config=config) + model.to(torch_device) + model.eval() + result = model(inputs, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, inputs, input_mask, sequence_labels, token_labels = config_and_inputs + inputs_dict = {"inputs": inputs, "attention_mask": input_mask} + return config, inputs_dict + + def prepare_config_and_inputs_for_model_class(self, model_class): + config_and_inputs = self.prepare_config_and_inputs(model_class) + config, inputs, input_mask, sequence_labels, token_labels = config_and_inputs + inputs_dict = {"inputs": inputs, "attention_mask": input_mask} + + return config, inputs_dict + + +@require_torch +class PerceiverModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + PerceiverModel, + PerceiverForMaskedLM, + PerceiverForImageClassificationLearned, + PerceiverForImageClassificationConvProcessing, + PerceiverForImageClassificationFourier, + PerceiverForOpticalFlow, + PerceiverForMultimodalAutoencoding, + PerceiverForSequenceClassification, + ) + if is_torch_available() + else () + ) + test_pruning = False + test_head_masking = False + test_torchscript = False + + maxDiff = None + + def setUp(self): + self.model_tester = PerceiverModelTester(self) + self.config_tester = ConfigTester(self, config_class=PerceiverConfig, hidden_size=37) + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = copy.deepcopy(inputs_dict) + + if model_class.__name__ == "PerceiverForMultimodalAutoencoding": + inputs_dict["subsampled_output_points"] = self.model_tester.subsampling + + if return_labels: + if model_class in [ + *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), + ]: + inputs_dict["labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + elif model_class in [ + *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_MASKED_LM_MAPPING), + ]: + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + return inputs_dict + + def test_config(self): + # we don't test common_properties and arguments_init as these don't apply for Perceiver + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class=PerceiverForMaskedLM) + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class=PerceiverForSequenceClassification) + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_image_classification_learned(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + model_class=PerceiverForImageClassificationLearned + ) + self.model_tester.create_and_check_for_image_classification_learned(*config_and_inputs) + + def test_for_image_classification_fourier(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + model_class=PerceiverForImageClassificationFourier + ) + self.model_tester.create_and_check_for_image_classification_fourier(*config_and_inputs) + + def test_for_image_classification_conv(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + model_class=PerceiverForImageClassificationConvProcessing + ) + self.model_tester.create_and_check_for_image_classification_conv(*config_and_inputs) + + def test_model_common_attributes(self): + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) + model = model_class(config) + # we overwrite this, as the embeddings of Perceiver are an instance of nn.Parameter + # and Perceiver doesn't support get_output_embeddings + self.assertIsInstance(model.get_input_embeddings(), (nn.Parameter)) + + def test_training(self): + if not self.model_tester.is_training: + return + + for model_class in self.all_model_classes: + if model_class in [ + *get_values(MODEL_MAPPING), + PerceiverForOpticalFlow, + PerceiverForMultimodalAutoencoding, + ]: + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) + config.return_dict = True + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_forward_signature(self): + for model_class in self.all_model_classes: + config, _ = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) + + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["inputs"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_determinism(self): + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) + + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + inputs_dict = self._prepare_for_class(inputs_dict, model_class) + first = model(**inputs_dict)[0] + second = model(**inputs_dict)[0] + + if model_class.__name__ == "PerceiverForMultimodalAutoencoding": + # model outputs a dictionary with logits per modality, let's verify each modality + for modality in first.keys(): + out_1 = first[modality].cpu().numpy() + out_2 = second[modality].cpu().numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + else: + out_1 = first.cpu().numpy() + out_2 = second.cpu().numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def test_attention_outputs(self): + seq_len = getattr(self.model_tester, "num_latents", None) + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) + config.return_dict = True + + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + self_attentions = outputs.attentions + cross_attentions = outputs.cross_attentions + + # check expected number of attentions depending on model class + expected_num_self_attentions = self.model_tester.num_blocks * self.model_tester.num_self_attends_per_block + if model.__class__.__name__ == "PerceiverModel": + # we expect to have 2 cross-attentions, namely one in the PerceiverEncoder, and one in PerceiverBasicDecoder + expected_num_cross_attentions = 1 + else: + # we expect to have 2 cross-attentions, namely one in the PerceiverEncoder, and one in PerceiverBasicDecoder + expected_num_cross_attentions = 2 + self.assertEqual(len(self_attentions), expected_num_self_attentions) + self.assertEqual(len(cross_attentions), expected_num_cross_attentions) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + self_attentions = outputs.attentions + cross_attentions = outputs.cross_attentions + self.assertEqual(len(self_attentions), expected_num_self_attentions) + self.assertEqual(len(cross_attentions), expected_num_cross_attentions) + + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_self_attention_heads, seq_len, seq_len], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 1, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), expected_num_self_attentions) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_self_attention_heads, seq_len, seq_len], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = self.model_tester.num_blocks * self.model_tester.num_self_attends_per_block + 1 + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_length = self.model_tester.num_latents + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.d_latents], + ) + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) + + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_model_outputs_equivalence(self): + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, Dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 + ), + msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. " + f"Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. " + f"Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}.", + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) + + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + if model_class.__name__ not in ["PerceiverForOpticalFlow", "PerceiverForMultimodalAutoencoding"]: + # optical flow + multimodal models don't support training for now + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + if model_class.__name__ not in ["PerceiverForOpticalFlow", "PerceiverForMultimodalAutoencoding"]: + # optical flow + multimodal models don't support training for now + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + if model_class.__name__ not in ["PerceiverForOpticalFlow", "PerceiverForMultimodalAutoencoding"]: + # optical flow + multimodal models don't support training for now + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + if model_class.__name__ not in ["PerceiverForOpticalFlow", "PerceiverForMultimodalAutoencoding"]: + # optical flow + multimodal models don't support training for now + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + ) + + def test_retain_grad_hidden_states_attentions(self): + # no need to test all models as different heads yield the same functionality + model_class = PerceiverForMaskedLM + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) + config.output_hidden_states = True + config.output_attentions = True + + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + # Encoder-only model + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_feed_forward_chunking(self): + for model_class in self.all_model_classes: + original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) + torch.manual_seed(0) + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + model.eval() + + hidden_states_no_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0] + + torch.manual_seed(0) + config.chunk_size_feed_forward = 1 + model = model_class(config) + model.to(torch_device) + model.eval() + + hidden_states_with_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0] + if model_class.__name__ == "PerceiverForMultimodalAutoencoding": + # model outputs a dictionary with logits for each modality + for modality in hidden_states_no_chunk.keys(): + self.assertTrue( + torch.allclose(hidden_states_no_chunk[modality], hidden_states_with_chunk[modality], atol=1e-3) + ) + else: + self.assertTrue(torch.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3)) + + def test_save_load(self): + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) + + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "PerceiverForMultimodalAutoencoding": + for modality in outputs[0].keys(): + out_2 = outputs[0][modality].cpu().numpy() + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname) + model.to(torch_device) + with torch.no_grad(): + after_outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + # Make sure we don't have nans + out_1 = after_outputs[0][modality].cpu().numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + else: + out_2 = outputs[0].cpu().numpy() + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname) + model.to(torch_device) + with torch.no_grad(): + after_outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + # Make sure we don't have nans + out_1 = after_outputs[0].cpu().numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def test_correct_missing_keys(self): + if not self.test_missing_keys: + return + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + # most Perceiver models don't have a typical head like is the case with BERT + if model_class in [ + PerceiverForOpticalFlow, + PerceiverForMultimodalAutoencoding, + *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), + ]: + continue + + model = model_class(config) + base_model_prefix = model.base_model_prefix + + if hasattr(model, base_model_prefix): + with tempfile.TemporaryDirectory() as temp_dir_name: + model.base_model.save_pretrained(temp_dir_name) + model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True) + with self.subTest(msg=f"Missing keys for {model.__class__.__name__}"): + self.assertGreater(len(loading_info["missing_keys"]), 0) + + def test_problem_types(self): + problem_types = [ + {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float}, + {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long}, + {"title": "regression", "num_labels": 1, "dtype": torch.float}, + ] + + for model_class in self.all_model_classes: + if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): + continue + + config, inputs, input_mask, _, _ = self.model_tester.prepare_config_and_inputs(model_class=model_class) + inputs_dict = dict(inputs=inputs, attention_mask=input_mask) + + for problem_type in problem_types: + with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): + + config.problem_type = problem_type["title"] + config.num_labels = problem_type["num_labels"] + + model = model_class(config) + model.to(torch_device) + model.train() + + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + + if problem_type["num_labels"] > 1: + inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"]) + + inputs["labels"] = inputs["labels"].to(problem_type["dtype"]) + + # This tests that we do not trigger the warning form PyTorch "Using a target size that is different + # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure + # they have the same size." which is a symptom something in wrong for the regression problem. + # See https://github.com/huggingface/transformers/issues/11780 + with warnings.catch_warnings(record=True) as warning_list: + loss = model(**inputs).loss + for w in warning_list: + if "Using a target size that is different to the input size" in str(w.message): + raise ValueError( + f"Something is going wrong in the regression problem: intercepted {w.message}" + ) + + loss.backward() + + @unittest.skip(reason="Perceiver models don't have a typical head like is the case with BERT") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="Perceiver models don't have a typical head like is the case with BERT") + def test_save_load_fast_init_to_base(self): + pass + + @unittest.skip(reason="Perceiver doesn't support resize_token_embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Perceiver doesn't support resize_token_embeddings") + def test_resize_embeddings_untied(self): + pass + + @unittest.skip(reason="Perceiver doesn't support inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Perceiver doesn't support the AutoModel API") + def test_load_with_mismatched_shapes(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_name in PERCEIVER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = PerceiverModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +# Helper functions for optical flow integration test +def prepare_optical_flow_images(): + dataset = load_dataset("hf-internal-testing/fixtures_sintel", split="test") + image1 = Image.open(dataset[0]["file"]).convert("RGB") + image2 = Image.open(dataset[0]["file"]).convert("RGB") + + return image1, image2 + + +def normalize(img): + return img / 255.0 * 2 - 1 + + +def extract_image_patches(x, kernel, stride=1, dilation=1): + # Do TF 'SAME' Padding + b, c, h, w = x.shape + h2 = math.ceil(h / stride) + w2 = math.ceil(w / stride) + pad_row = (h2 - 1) * stride + (kernel - 1) * dilation + 1 - h + pad_col = (w2 - 1) * stride + (kernel - 1) * dilation + 1 - w + x = torch.nn.functional.pad(x, (pad_row // 2, pad_row - pad_row // 2, pad_col // 2, pad_col - pad_col // 2)) + + # Extract patches + patches = x.unfold(2, kernel, stride).unfold(3, kernel, stride) + patches = patches.permute(0, 4, 5, 1, 2, 3).contiguous() + + return patches.view(b, -1, patches.shape[-2], patches.shape[-1]) + + +@require_torch +@require_vision +class PerceiverModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + + tokenizer = PerceiverTokenizer.from_pretrained("deepmind/language-perceiver") + model = PerceiverForMaskedLM.from_pretrained("deepmind/language-perceiver") + model.to(torch_device) + + # prepare inputs + text = "This is an incomplete sentence where some words are missing." + encoding = tokenizer(text, padding="max_length", return_tensors="pt") + + # mask " missing.". + encoding.input_ids[0, 52:61] = tokenizer.mask_token_id + inputs, input_mask = encoding.input_ids.to(torch_device), encoding.attention_mask.to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(inputs=inputs, attention_mask=input_mask) + logits = outputs.logits + + # verify logits + expected_shape = torch.Size((1, tokenizer.model_max_length, tokenizer.vocab_size)) + self.assertEqual(logits.shape, expected_shape) + + expected_slice = torch.tensor( + [[-10.8609, -10.7651, -10.9187], [-12.1689, -11.9389, -12.1479], [-12.1518, -11.9707, -12.2073]] + ) + + self.assertTrue(torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4)) + + expected_greedy_predictions = [38, 115, 111, 121, 121, 111, 116, 109, 52] + masked_tokens_predictions = logits[0, 52:61].argmax(dim=-1).tolist() + self.assertListEqual(expected_greedy_predictions, masked_tokens_predictions) + + @slow + def test_inference_image_classification(self): + + feature_extractor = PerceiverFeatureExtractor() + model = PerceiverForImageClassificationLearned.from_pretrained("deepmind/vision-perceiver-learned") + model.to(torch_device) + + # prepare inputs + image = prepare_img() + inputs = feature_extractor(image, return_tensors="pt").pixel_values.to(torch_device) + input_mask = None + + # forward pass + with torch.no_grad(): + outputs = model(inputs=inputs, attention_mask=input_mask) + logits = outputs.logits + + # verify logits + expected_shape = torch.Size((1, model.config.num_labels)) + self.assertEqual(logits.shape, expected_shape) + + expected_slice = torch.tensor([-1.1653, -0.1993, -0.7521], device=torch_device) + + self.assertTrue(torch.allclose(logits[0, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_image_classification_fourier(self): + + feature_extractor = PerceiverFeatureExtractor() + model = PerceiverForImageClassificationFourier.from_pretrained("deepmind/vision-perceiver-fourier") + model.to(torch_device) + + # prepare inputs + image = prepare_img() + inputs = feature_extractor(image, return_tensors="pt").pixel_values.to(torch_device) + input_mask = None + + # forward pass + with torch.no_grad(): + outputs = model(inputs=inputs, attention_mask=input_mask) + logits = outputs.logits + + # verify logits + expected_shape = torch.Size((1, model.config.num_labels)) + self.assertEqual(logits.shape, expected_shape) + + expected_slice = torch.tensor([-1.1295, -0.2832, 0.3226], device=torch_device) + + self.assertTrue(torch.allclose(logits[0, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_image_classification_conv(self): + + feature_extractor = PerceiverFeatureExtractor() + model = PerceiverForImageClassificationConvProcessing.from_pretrained("deepmind/vision-perceiver-conv") + model.to(torch_device) + + # prepare inputs + image = prepare_img() + inputs = feature_extractor(image, return_tensors="pt").pixel_values.to(torch_device) + input_mask = None + + # forward pass + with torch.no_grad(): + outputs = model(inputs=inputs, attention_mask=input_mask) + logits = outputs.logits + + # verify logits + expected_shape = torch.Size((1, model.config.num_labels)) + self.assertEqual(logits.shape, expected_shape) + + expected_slice = torch.tensor([-1.1186, 0.0554, 0.0897], device=torch_device) + + self.assertTrue(torch.allclose(logits[0, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_optical_flow(self): + model = PerceiverForOpticalFlow.from_pretrained("deepmind/optical-flow-perceiver") + model.to(torch_device) + + # prepare inputs + image1, image2 = prepare_optical_flow_images() + img1 = normalize(np.array(image1)) + img2 = normalize(np.array(image1)) + + # stack images + img1 = torch.tensor(np.moveaxis(img1, -1, 0)) + img2 = torch.tensor(np.moveaxis(img2, -1, 0)) + images = torch.stack([img1, img2], dim=0) + + # extract 3x3 patches + patch_size = model.config.train_size + + inputs = images[..., : patch_size[0], : patch_size[1]].unsqueeze(0) + batch_size, _, C, H, W = inputs.shape + patches = extract_image_patches(inputs.view(batch_size * 2, C, H, W), kernel=3) + _, C, H, W = patches.shape + patches = patches.view(batch_size, -1, C, H, W).float() + + # forward pass + with torch.no_grad(): + outputs = model(inputs=patches) + logits = outputs.logits + + # verify logits + expected_shape = torch.Size((1, 368, 496, 2)) + self.assertEqual(logits.shape, expected_shape) + + expected_slice = torch.tensor( + [ + [[0.0025, -0.0050], [0.0025, -0.0049], [0.0025, -0.0048]], + [[0.0026, -0.0049], [0.0026, -0.0048], [0.0026, -0.0047]], + [[0.0026, -0.0049], [0.0026, -0.0048], [0.0026, -0.0046]], + ], + device=torch_device, + ) + + self.assertTrue(torch.allclose(logits[0, :3, :3, :3], expected_slice, atol=1e-4)) diff --git a/tests/test_tokenization_perceiver.py b/tests/test_tokenization_perceiver.py new file mode 100644 index 0000000000..119a82df8a --- /dev/null +++ b/tests/test_tokenization_perceiver.py @@ -0,0 +1,288 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import re +import shutil +import tempfile +import unittest +from typing import Tuple + +from transformers import AddedToken, BatchEncoding, PerceiverTokenizer +from transformers.file_utils import cached_property, is_tf_available, is_torch_available + +from .test_tokenization_common import TokenizerTesterMixin + + +if is_torch_available(): + FRAMEWORK = "pt" +elif is_tf_available(): + FRAMEWORK = "tf" +else: + FRAMEWORK = "jax" + + +class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = PerceiverTokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + tokenizer = PerceiverTokenizer() + tokenizer.save_pretrained(self.tmpdirname) + + @cached_property + def perceiver_tokenizer(self): + return PerceiverTokenizer.from_pretrained("deepmind/language-perceiver") + + def get_tokenizer(self, **kwargs) -> PerceiverTokenizer: + return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]: + # XXX The default common tokenizer tests assume that every ID is decodable on its own. + # This assumption is invalid for Perceiver because single bytes might not be + # valid utf-8 (byte 128 for instance). + # Here we're overriding the smallest possible method to provide + # a clean sequence without making the same assumption. + + toks = [] + for i in range(len(tokenizer)): + try: + tok = tokenizer.decode([i], clean_up_tokenization_spaces=False) + except UnicodeDecodeError: + pass + toks.append((i, tok)) + + toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks)) + toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks)) + if max_length is not None and len(toks) > max_length: + toks = toks[:max_length] + if min_length is not None and len(toks) < min_length and len(toks) > 0: + while len(toks) < min_length: + toks = toks + toks + # toks_str = [t[1] for t in toks] + toks_ids = [t[0] for t in toks] + + # Ensure consistency + output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False) + if " " not in output_txt and len(toks_ids) > 1: + output_txt = ( + tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False) + + " " + + tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False) + ) + if with_prefix_space: + output_txt = " " + output_txt + output_ids = tokenizer.encode(output_txt, add_special_tokens=False) + return output_txt, output_ids + + def test_multibytes_char(self): + tokenizer = self.perceiver_tokenizer + src_text = "Unicode €." + encoded = tokenizer(src_text) + encoded_ids = [4, 91, 116, 111, 105, 117, 106, 107, 38, 232, 136, 178, 52, 5] + self.assertEqual(encoded["input_ids"], encoded_ids) + + # decoding + decoded = tokenizer.decode(encoded_ids) + self.assertEqual(decoded, "Unicode €.") + + encoded = tokenizer("e è é ê ë") + encoded_ids = [4, 107, 38, 201, 174, 38, 201, 175, 38, 201, 176, 38, 201, 177, 5] + self.assertEqual(encoded["input_ids"], encoded_ids) + # decoding + decoded = tokenizer.decode(encoded_ids) + self.assertEqual(decoded, "e è é ê ë") + + # encode/decode, but with `encode` instead of `__call__` + self.assertEqual(tokenizer.decode(tokenizer.encode("e è é ê ë")), "e è é ê ë") + + def test_prepare_batch_integration(self): + tokenizer = self.perceiver_tokenizer + src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] + # fmt: off + expected_src_tokens = [4, 71, 38, 114, 117, 116, 109, 38, 118, 103, 120, 103, 109, 120, 103, 118, 110, 38, 108, 117, 120, 38, 121, 123, 115, 115, 103, 120, 111, 128, 103, 122, 111, 117, 116, 52, 5, 0] + # fmt: on + batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK) + self.assertIsInstance(batch, BatchEncoding) + + if FRAMEWORK != "jax": + result = list(batch.input_ids.numpy()[0]) + else: + result = list(batch.input_ids.tolist()[0]) + + self.assertListEqual(expected_src_tokens, result) + + self.assertEqual((2, 38), batch.input_ids.shape) + self.assertEqual((2, 38), batch.attention_mask.shape) + + def test_empty_target_text(self): + tokenizer = self.perceiver_tokenizer + src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] + batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK) + # check if input_ids are returned and no decoder_input_ids + self.assertIn("input_ids", batch) + self.assertIn("attention_mask", batch) + self.assertNotIn("decoder_input_ids", batch) + self.assertNotIn("decoder_attention_mask", batch) + + def test_max_length_integration(self): + tokenizer = self.perceiver_tokenizer + tgt_text = [ + "Summary of the text.", + "Another summary.", + ] + with tokenizer.as_target_tokenizer(): + targets = tokenizer( + tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK + ) + self.assertEqual(32, targets["input_ids"].shape[1]) + + # cannot use default save_and_load_tokenzier test method because tokenzier has no vocab + def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + self.assertNotEqual(tokenizer.model_max_length, 42) + + # Now let's start the test + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00E9d,running" + before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) + self.assertListEqual(before_tokens, after_tokens) + + shutil.rmtree(tmpdirname) + + tokenizers = self.get_tokenizers(model_max_length=42) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00E9d,running" + tokenizer.add_tokens(["bim", "bambam"]) + additional_special_tokens = tokenizer.additional_special_tokens + additional_special_tokens.append("new_additional_special_token") + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) + self.assertListEqual(before_tokens, after_tokens) + self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens) + self.assertEqual(after_tokenizer.model_max_length, 42) + + tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43) + self.assertEqual(tokenizer.model_max_length, 43) + + shutil.rmtree(tmpdirname) + + # There is a conflict between the default value of extra_ids and adding a new special token through additional_special_tokens + # We need to add the extra_ids in the list of the arg additional_special_tokens + def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self): + tokenizer_list = [] + if self.test_slow_tokenizer: + tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) + + if self.test_rust_tokenizer: + tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer())) + + for tokenizer_class, tokenizer_utils in tokenizer_list: + with tempfile.TemporaryDirectory() as tmp_dir: + tokenizer_utils.save_pretrained(tmp_dir) + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file: + special_tokens_map = json.load(json_file) + + with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file: + tokenizer_config = json.load(json_file) + + added_tokens_extra_ids = [f"" for i in range(125)] + + special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [ + "an_additional_special_token" + ] + tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [ + "an_additional_special_token" + ] + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile: + json.dump(special_tokens_map, outfile) + with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile: + json.dump(tokenizer_config, outfile) + + # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes + # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and + # "special_tokens_map.json" files + tokenizer_without_change_in_init = tokenizer_class.from_pretrained( + tmp_dir, + ) + self.assertIn( + "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens + ) + self.assertEqual( + ["an_additional_special_token"], + tokenizer_without_change_in_init.convert_ids_to_tokens( + tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"]) + ), + ) + + # Now we test that we can change the value of additional_special_tokens in the from_pretrained + new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)] + tokenizer = tokenizer_class.from_pretrained( + tmp_dir, + additional_special_tokens=new_added_tokens, + ) + + self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens) + self.assertEqual( + ["a_new_additional_special_token"], + tokenizer.convert_ids_to_tokens( + tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"]) + ), + ) + + def test_decode_invalid_byte_id(self): + tokenizer = self.perceiver_tokenizer + self.assertEqual(tokenizer.decode([178]), "�") + + # tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list + def test_pretrained_model_lists(self): + pass + + # tokenizer does not have vocabulary + def test_get_vocab(self): + pass + + # inputs cannot be pretokenized since ids depend on whole input string and not just on single characters + def test_pretokenized_inputs(self): + pass + + # tests all ids in vocab => vocab doesn't exist so unnecessary to test + def test_conversion_reversible(self): + pass diff --git a/utils/check_repo.py b/utils/check_repo.py index 2cb204313e..0522e900dc 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -102,6 +102,8 @@ TEST_FILES_WITH_NO_COMMON_TESTS = [ # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping + "PerceiverForMultimodalAutoencoding", + "PerceiverForOpticalFlow", "SegformerDecodeHead", "SegformerForSemanticSegmentation", "BeitForSemanticSegmentation",