From f617490e718f5a5456764777f4960ea6c98cd1f1 Mon Sep 17 00:00:00 2001 From: abhishek thakur Date: Wed, 27 Jan 2021 09:20:09 +0100 Subject: [PATCH] ConvBERT Model (#9717) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * finalize convbert * finalize convbert * fix * fix * fix * push * fix * tf image patches * fix torch model * tf tests * conversion * everything aligned * remove print * tf tests * fix tf * make tf tests pass * everything works * fix init * fix * special treatment for sepconv1d * style * 🙏🏽 * add doc and cleanup * add electra test again * fix doc * fix doc again * fix doc again * Update src/transformers/modeling_tf_pytorch_utils.py Co-authored-by: Lysandre Debut * Update src/transformers/models/conv_bert/configuration_conv_bert.py Co-authored-by: Lysandre Debut * Update docs/source/model_doc/conv_bert.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/auto/configuration_auto.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/conv_bert/configuration_conv_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * conv_bert -> convbert * more fixes from review * add conversion script * dont use pretrained embed * unused config * suggestions from julien * some more fixes * p -> param * fix copyright * fix doc * Update src/transformers/models/convbert/configuration_convbert.py Co-authored-by: Patrick von Platen * comments from reviews * fix-copies * fix style * revert shape_list Co-authored-by: Lysandre Debut Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Patrick von Platen --- README.md | 1 + docs/source/index.rst | 70 +- docs/source/model_doc/convbert.rst | 143 ++ docs/source/model_summary.rst | 30 + src/transformers/__init__.py | 56 + src/transformers/convert_slow_tokenizer.py | 1 + src/transformers/modeling_tf_pytorch_utils.py | 5 +- src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 4 + src/transformers/models/auto/modeling_auto.py | 18 + .../models/auto/modeling_tf_auto.py | 16 + .../models/auto/tokenization_auto.py | 5 + src/transformers/models/convbert/__init__.py | 111 ++ .../models/convbert/configuration_convbert.py | 136 ++ ...bert_original_tf1_checkpoint_to_pytorch.py | 52 + .../models/convbert/modeling_convbert.py | 1307 ++++++++++++++ .../models/convbert/modeling_tf_convbert.py | 1514 +++++++++++++++++ .../models/convbert/tokenization_convbert.py | 56 + .../convbert/tokenization_convbert_fast.py | 61 + src/transformers/utils/dummy_pt_objects.py | 75 + src/transformers/utils/dummy_tf_objects.py | 71 + .../utils/dummy_tokenizers_objects.py | 9 + tests/test_modeling_convbert.py | 433 +++++ tests/test_modeling_tf_convbert.py | 394 +++++ utils/check_repo.py | 2 + 25 files changed, 4538 insertions(+), 33 deletions(-) create mode 100644 docs/source/model_doc/convbert.rst create mode 100644 src/transformers/models/convbert/__init__.py create mode 100644 src/transformers/models/convbert/configuration_convbert.py create mode 100644 src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch.py create mode 100755 src/transformers/models/convbert/modeling_convbert.py create mode 100644 src/transformers/models/convbert/modeling_tf_convbert.py create mode 100644 src/transformers/models/convbert/tokenization_convbert.py create mode 100644 src/transformers/models/convbert/tokenization_convbert_fast.py create mode 100644 tests/test_modeling_convbert.py create mode 100644 tests/test_modeling_tf_convbert.py diff --git a/README.md b/README.md index 2a3197f646..746e789c8f 100644 --- a/README.md +++ b/README.md @@ -197,6 +197,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. +1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft Research) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. diff --git a/docs/source/index.rst b/docs/source/index.rst index ca9d86745b..3ac7e8c2af 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -106,97 +106,100 @@ and conversion utilities for the following models: 8. :doc:`CamemBERT ` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty French Language Model `__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. -9. :doc:`CTRL ` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language - Model for Controllable Generation `__ by Nitish Shirish Keskar*, Bryan McCann*, - Lav R. Varshney, Caiming Xiong and Richard Socher. -10. :doc:`DeBERTa ` (from Microsoft Research) released with the paper `DeBERTa: Decoding-enhanced +9. :doc:`ConvBERT ` (from YituTech) released with the paper `ConvBERT: Improving BERT with + Span-based Dynamic Convolution `__ by Zihang Jiang, Weihao Yu, Daquan Zhou, + Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +10. :doc:`CTRL ` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language + Model for Controllable Generation `__ by Nitish Shirish Keskar*, Bryan McCann*, + Lav R. Varshney, Caiming Xiong and Richard Socher. +11. :doc:`DeBERTa ` (from Microsoft Research) released with the paper `DeBERTa: Decoding-enhanced BERT with Disentangled Attention `__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -11. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale +12. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation `__ by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -12. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a +13. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter `__ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 `__, RoBERTa into `DistilRoBERTa `__, Multilingual BERT into `DistilmBERT `__ and a German version of DistilBERT. -13. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain +14. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain Question Answering `__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. -14. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: +15. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: Pre-training text encoders as discriminators rather than generators `__ by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -15. :doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model +16. :doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French `__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -16. :doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer: +17. :doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing `__ by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -17. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative +18. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training `__ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. -18. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask +19. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask Learners `__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -19. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training +20. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training of Text and Layout for Document Image Understanding `__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -20. :doc:`LED ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer +21. :doc:`LED ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. -21. :doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document +22. :doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. -22. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality +23. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering `__ by Hao Tan and Mohit Bansal. -23. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by +24. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by Jörg Tiedemann. The `Marian Framework `__ is being developed by the Microsoft Translator Team. -24. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for +25. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for Neural Machine Translation `__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -25. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted +26. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted Pre-training for Language Understanding `__ by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -26. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained +27. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained text-to-text transformer `__ by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -27. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted +28. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization `__> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. -28. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting +29. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -29. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient +30. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient Transformer `__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -30. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT +31. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach `__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. ultilingual BERT into `DistilmBERT `__ and a German version of DistilBERT. -31. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP +32. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP about efficient neural networks? `__ by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -32. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a +33. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer `__ by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -33. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via +34. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via Pre-training `__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. -34. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: +35. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `__ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -35. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model +36. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining `__ by Guillaume Lample and Alexis Conneau. -36. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: +37. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -37. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised +38. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale `__ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. -38. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive +39. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding `__ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. @@ -231,6 +234,8 @@ TensorFlow and/or Flax. +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | DeBERTa | ✅ | ❌ | ✅ | ❌ | ❌ | @@ -371,6 +376,7 @@ TensorFlow and/or Flax. model_doc/blenderbot model_doc/blenderbot_small model_doc/camembert + model_doc/convbert model_doc/ctrl model_doc/deberta model_doc/dialogpt diff --git a/docs/source/model_doc/convbert.rst b/docs/source/model_doc/convbert.rst new file mode 100644 index 0000000000..ec2aefa6ad --- /dev/null +++ b/docs/source/model_doc/convbert.rst @@ -0,0 +1,143 @@ +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +ConvBERT +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ConvBERT model was proposed in `ConvBERT: Improving BERT with Span-based Dynamic Convolution +`__ by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng +Yan. + +The abstract from the paper is the following: + +*Pre-trained language models like BERT and its variants have recently achieved impressive performance in various +natural language understanding tasks. However, BERT heavily relies on the global self-attention block and thus suffers +large memory footprint and computation cost. Although all its attention heads query on the whole input sequence for +generating the attention map from a global perspective, we observe some heads only need to learn local dependencies, +which means the existence of computation redundancy. We therefore propose a novel span-based dynamic convolution to +replace these self-attention heads to directly model local dependencies. The novel convolution heads, together with the +rest self-attention heads, form a new mixed attention block that is more efficient at both global and local context +learning. We equip BERT with this mixed attention design and build a ConvBERT model. Experiments have shown that +ConvBERT significantly outperforms BERT and its variants in various downstream tasks, with lower training cost and +fewer model parameters. Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while +using less than 1/4 training cost. Code and pre-trained models will be released.* + +ConvBERT training tips are similar to those of BERT. The original implementation can be found here: +https://github.com/yitu-opensource/ConvBert + +ConvBertConfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ConvBertConfig + :members: + + +ConvBertTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ConvBertTokenizer + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary + + +ConvBertTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ConvBertTokenizerFast + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary + + +ConvBertModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ConvBertModel + :members: forward + + +ConvBertForMaskedLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ConvBertForMaskedLM + :members: forward + + +ConvBertForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ConvBertForSequenceClassification + :members: forward + + +ConvBertForMultipleChoice +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ConvBertForMultipleChoice + :members: forward + + +ConvBertForTokenClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ConvBertForTokenClassification + :members: forward + + +ConvBertForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ConvBertForQuestionAnswering + :members: forward + + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFConvBertModel + :members: call + + +TFConvBertForMaskedLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFConvBertForMaskedLM + :members: call + + +TFConvBertForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFConvBertForSequenceClassification + :members: call + + +TFConvBertForMultipleChoice +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFConvBertForMultipleChoice + :members: call + + +TFConvBertForTokenClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFConvBertForTokenClassification + :members: call + + +TFConvBertForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFConvBertForQuestionAnswering + :members: call diff --git a/docs/source/model_summary.rst b/docs/source/model_summary.rst index 5567c49e29..89eb45716d 100644 --- a/docs/source/model_summary.rst +++ b/docs/source/model_summary.rst @@ -330,6 +330,36 @@ the same probabilities as the larger model. The actual objective is a combinatio The library provides a version of the model for masked language modeling, token classification, sentence classification and question answering. +ConvBERT +----------------------------------------------------------------------------------------------------------------------- + +.. raw:: html + + + Models + + + Doc + + +`ConvBERT: Improving BERT with Span-based Dynamic Convolution `_, Zihang Jiang, +Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. + +Pre-trained language models like BERT and its variants have recently achieved impressive performance in various natural +language understanding tasks. However, BERT heavily relies on the global self-attention block and thus suffers large +memory footprint and computation cost. Although all its attention heads query on the whole input sequence for +generating the attention map from a global perspective, we observe some heads only need to learn local dependencies, +which means the existence of computation redundancy. We therefore propose a novel span-based dynamic convolution to +replace these self-attention heads to directly model local dependencies. The novel convolution heads, together with the +rest self-attention heads, form a new mixed attention block that is more efficient at both global and local context +learning. We equip BERT with this mixed attention design and build a ConvBERT model. Experiments have shown that +ConvBERT significantly outperforms BERT and its variants in various downstream tasks, with lower training cost and +fewer model parameters. Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while +using less than 1/4 training cost. + +The library provides a version of the model for masked language modeling, token classification, sentence classification +and question answering. + XLM ----------------------------------------------------------------------------------------------------------------------- diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 22ba510f33..ad38122aa4 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -125,6 +125,7 @@ _import_structure = { ], "models": [], # Models + "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], "models.auto": [ "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -275,6 +276,7 @@ else: # tokenziers-backed objects if is_tokenizers_available(): # Fast tokenizers + _import_structure["models.convbert"].append("ConvBertTokenizerFast") _import_structure["models.albert"].append("AlbertTokenizerFast") _import_structure["models.bart"].append("BartTokenizerFast") _import_structure["models.barthez"].append("BarthezTokenizerFast") @@ -360,6 +362,21 @@ if is_torch_available(): _import_structure["generation_utils"] = ["top_k_top_p_filtering"] _import_structure["modeling_utils"] = ["Conv1D", "PreTrainedModel", "apply_chunking_to_forward", "prune_layer"] # PyTorch models structure + + _import_structure["models.convbert"].extend( + [ + "CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "ConvBertForMaskedLM", + "ConvBertForMultipleChoice", + "ConvBertForQuestionAnswering", + "ConvBertForSequenceClassification", + "ConvBertForTokenClassification", + "ConvBertLayer", + "ConvBertModel", + "ConvBertPreTrainedModel", + "load_tf_weights_in_convbert", + ] + ) _import_structure["models.albert"].extend( [ "ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -816,6 +833,20 @@ if is_tf_available(): "shape_list", ] # TensorFlow models structure + + _import_structure["models.convbert"].extend( + [ + "TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFConvBertForMaskedLM", + "TFConvBertForMultipleChoice", + "TFConvBertForQuestionAnswering", + "TFConvBertForSequenceClassification", + "TFConvBertForTokenClassification", + "TFConvBertLayer", + "TFConvBertModel", + "TFConvBertPreTrainedModel", + ] + ) _import_structure["models.albert"].extend( [ "TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1235,6 +1266,7 @@ if TYPE_CHECKING: BlenderbotSmallTokenizer, ) from .models.camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig + from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer from .models.distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, DistilBertTokenizer @@ -1359,6 +1391,7 @@ if TYPE_CHECKING: from .models.barthez import BarthezTokenizerFast from .models.bert import BertTokenizerFast from .models.camembert import CamembertTokenizerFast + from .models.convbert import ConvBertTokenizerFast from .models.distilbert import DistilBertTokenizerFast from .models.dpr import DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast, DPRReaderTokenizerFast from .models.electra import ElectraTokenizerFast @@ -1521,6 +1554,18 @@ if TYPE_CHECKING: CamembertForTokenClassification, CamembertModel, ) + from .models.convbert import ( + CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + ConvBertForMaskedLM, + ConvBertForMultipleChoice, + ConvBertForQuestionAnswering, + ConvBertForSequenceClassification, + ConvBertForTokenClassification, + ConvBertLayer, + ConvBertModel, + ConvBertPreTrainedModel, + load_tf_weights_in_convbert, + ) from .models.ctrl import ( CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, CTRLForSequenceClassification, @@ -1879,6 +1924,17 @@ if TYPE_CHECKING: TFCamembertForTokenClassification, TFCamembertModel, ) + from .models.convbert import ( + TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFConvBertForMaskedLM, + TFConvBertForMultipleChoice, + TFConvBertForQuestionAnswering, + TFConvBertForSequenceClassification, + TFConvBertForTokenClassification, + TFConvBertLayer, + TFConvBertModel, + TFConvBertPreTrainedModel, + ) from .models.ctrl import ( TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, TFCTRLForSequenceClassification, diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 363026f6ab..ad301574f7 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -605,6 +605,7 @@ SLOW_TO_FAST_CONVERTERS = { "BarthezTokenizer": BarthezConverter, "BertTokenizer": BertConverter, "CamembertTokenizer": CamembertConverter, + "ConvBertTokenizer": BertConverter, "DistilBertTokenizer": BertConverter, "DPRReaderTokenizer": BertConverter, "DPRQuestionEncoderTokenizer": BertConverter, diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py index cdf979bf18..1e3c5e49fd 100644 --- a/src/transformers/modeling_tf_pytorch_utils.py +++ b/src/transformers/modeling_tf_pytorch_utils.py @@ -64,6 +64,10 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="") if tf_name[-1] == "beta": tf_name[-1] = "bias" + # The SeparableConv1D TF layer contains two weights that are translated to PyTorch Conv1D here + if tf_name[-1] == "pointwise_kernel" or tf_name[-1] == "depthwise_kernel": + tf_name[-1] = tf_name[-1].replace("_kernel", ".weight") + # Remove prefix if needed tf_name = ".".join(tf_name) if start_prefix_to_remove: @@ -127,7 +131,6 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a if tf_inputs is not None: tf_model(tf_inputs, training=False) # Make sure model is built - # Adapt state dict - TODO remove this and update the AWS weights files instead # Convert old format to new format if needed from a PyTorch state_dict old_keys = [] diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 32fdd4c5e3..280406889a 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -28,6 +28,7 @@ from . import ( blenderbot, blenderbot_small, camembert, + convbert, ctrl, deberta, dialogpt, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 444c39921a..55e48c0cf2 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -28,6 +28,7 @@ from ..blenderbot_small.configuration_blenderbot_small import ( BlenderbotSmallConfig, ) from ..camembert.configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig +from ..convbert.configuration_convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig from ..ctrl.configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig from ..deberta.configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig from ..distilbert.configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig @@ -71,6 +72,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict( (key, value) for pretrained_map in [ # Add archive maps here + CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LED_PRETRAINED_CONFIG_ARCHIVE_MAP, BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -112,6 +114,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict( CONFIG_MAPPING = OrderedDict( [ # Add configs here + ("convbert", ConvBertConfig), ("led", LEDConfig), ("blenderbot-small", BlenderbotSmallConfig), ("retribert", RetriBertConfig), @@ -159,6 +162,7 @@ CONFIG_MAPPING = OrderedDict( MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here + ("convbert", "ConvBERT"), ("led", "LED"), ("blenderbot-small", "BlenderbotSmall"), ("retribert", "RetriBERT"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index dd37b859a9..993beb1991 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -61,6 +61,16 @@ from ..camembert.modeling_camembert import ( CamembertForTokenClassification, CamembertModel, ) + +# Add modeling imports here +from ..convbert.modeling_convbert import ( + ConvBertForMaskedLM, + ConvBertForMultipleChoice, + ConvBertForQuestionAnswering, + ConvBertForSequenceClassification, + ConvBertForTokenClassification, + ConvBertModel, +) from ..ctrl.modeling_ctrl import CTRLForSequenceClassification, CTRLLMHeadModel, CTRLModel from ..deberta.modeling_deberta import ( DebertaForMaskedLM, @@ -234,6 +244,7 @@ from .configuration_auto import ( BlenderbotConfig, BlenderbotSmallConfig, CamembertConfig, + ConvBertConfig, CTRLConfig, DebertaConfig, DistilBertConfig, @@ -277,6 +288,7 @@ logger = logging.get_logger(__name__) MODEL_MAPPING = OrderedDict( [ # Base model mapping + (ConvBertConfig, ConvBertModel), (LEDConfig, LEDModel), (BlenderbotSmallConfig, BlenderbotSmallModel), (RetriBertConfig, RetriBertModel), @@ -355,6 +367,7 @@ MODEL_FOR_PRETRAINING_MAPPING = OrderedDict( MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( [ # Model with LM heads mapping + (ConvBertConfig, ConvBertForMaskedLM), (LEDConfig, LEDForConditionalGeneration), (BlenderbotSmallConfig, BlenderbotSmallForConditionalGeneration), (LayoutLMConfig, LayoutLMForMaskedLM), @@ -414,6 +427,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict( MODEL_FOR_MASKED_LM_MAPPING = OrderedDict( [ # Model for Masked LM mapping + (ConvBertConfig, ConvBertForMaskedLM), (LayoutLMConfig, LayoutLMForMaskedLM), (DistilBertConfig, DistilBertForMaskedLM), (AlbertConfig, AlbertForMaskedLM), @@ -459,6 +473,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict( MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( [ # Model for Sequence Classification mapping + (ConvBertConfig, ConvBertForSequenceClassification), (LEDConfig, LEDForSequenceClassification), (DistilBertConfig, DistilBertForSequenceClassification), (AlbertConfig, AlbertForSequenceClassification), @@ -491,6 +506,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( [ # Model for Question Answering mapping + (ConvBertConfig, ConvBertForQuestionAnswering), (LEDConfig, LEDForQuestionAnswering), (DistilBertConfig, DistilBertForQuestionAnswering), (AlbertConfig, AlbertForQuestionAnswering), @@ -525,6 +541,7 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = OrderedDict( MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( [ # Model for Token Classification mapping + (ConvBertConfig, ConvBertForTokenClassification), (LayoutLMConfig, LayoutLMForTokenClassification), (DistilBertConfig, DistilBertForTokenClassification), (CamembertConfig, CamembertForTokenClassification), @@ -549,6 +566,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( [ # Model for Multiple Choice mapping + (ConvBertConfig, ConvBertForMultipleChoice), (CamembertConfig, CamembertForMultipleChoice), (ElectraConfig, ElectraForMultipleChoice), (XLMRobertaConfig, XLMRobertaForMultipleChoice), diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 1fadbfcabe..b127ecda0d 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -57,6 +57,14 @@ from ..camembert.modeling_tf_camembert import ( TFCamembertForTokenClassification, TFCamembertModel, ) +from ..convbert.modeling_tf_convbert import ( + TFConvBertForMaskedLM, + TFConvBertForMultipleChoice, + TFConvBertForQuestionAnswering, + TFConvBertForSequenceClassification, + TFConvBertForTokenClassification, + TFConvBertModel, +) from ..ctrl.modeling_tf_ctrl import TFCTRLForSequenceClassification, TFCTRLLMHeadModel, TFCTRLModel from ..distilbert.modeling_tf_distilbert import ( TFDistilBertForMaskedLM, @@ -173,6 +181,7 @@ from .configuration_auto import ( BlenderbotConfig, BlenderbotSmallConfig, CamembertConfig, + ConvBertConfig, CTRLConfig, DistilBertConfig, DPRConfig, @@ -206,6 +215,7 @@ logger = logging.get_logger(__name__) TF_MODEL_MAPPING = OrderedDict( [ # Base model mapping + (ConvBertConfig, TFConvBertModel), (LEDConfig, TFLEDModel), (LxmertConfig, TFLxmertModel), (MT5Config, TFMT5Model), @@ -268,6 +278,7 @@ TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict( TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( [ # Model with LM heads mapping + (ConvBertConfig, TFConvBertForMaskedLM), (LEDConfig, TFLEDForConditionalGeneration), (T5Config, TFT5ForConditionalGeneration), (DistilBertConfig, TFDistilBertForMaskedLM), @@ -312,6 +323,7 @@ TF_MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict( TF_MODEL_FOR_MASKED_LM_MAPPING = OrderedDict( [ # Model for Masked LM mapping + (ConvBertConfig, TFConvBertForMaskedLM), (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForMaskedLM), (CamembertConfig, TFCamembertForMaskedLM), @@ -347,6 +359,7 @@ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict( TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( [ # Model for Sequence Classification mapping + (ConvBertConfig, TFConvBertForSequenceClassification), (DistilBertConfig, TFDistilBertForSequenceClassification), (AlbertConfig, TFAlbertForSequenceClassification), (CamembertConfig, TFCamembertForSequenceClassification), @@ -371,6 +384,7 @@ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( [ # Model for Question Answering mapping + (ConvBertConfig, TFConvBertForQuestionAnswering), (DistilBertConfig, TFDistilBertForQuestionAnswering), (AlbertConfig, TFAlbertForQuestionAnswering), (CamembertConfig, TFCamembertForQuestionAnswering), @@ -391,6 +405,7 @@ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( [ # Model for Token Classification mapping + (ConvBertConfig, TFConvBertForTokenClassification), (DistilBertConfig, TFDistilBertForTokenClassification), (AlbertConfig, TFAlbertForTokenClassification), (CamembertConfig, TFCamembertForTokenClassification), @@ -411,6 +426,7 @@ TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( [ # Model for Multiple Choice mapping + (ConvBertConfig, TFConvBertForMultipleChoice), (CamembertConfig, TFCamembertForMultipleChoice), (XLMConfig, TFXLMForMultipleChoice), (XLMRobertaConfig, TFXLMRobertaForMultipleChoice), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index c3021ec0b8..55f7a42223 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -26,6 +26,7 @@ from ..bert_japanese.tokenization_bert_japanese import BertJapaneseTokenizer from ..bertweet.tokenization_bertweet import BertweetTokenizer from ..blenderbot.tokenization_blenderbot import BlenderbotTokenizer from ..blenderbot_small.tokenization_blenderbot_small import BlenderbotSmallTokenizer +from ..convbert.tokenization_convbert import ConvBertTokenizer from ..ctrl.tokenization_ctrl import CTRLTokenizer from ..deberta.tokenization_deberta import DebertaTokenizer from ..distilbert.tokenization_distilbert import DistilBertTokenizer @@ -61,6 +62,7 @@ from .configuration_auto import ( BlenderbotConfig, BlenderbotSmallConfig, CamembertConfig, + ConvBertConfig, CTRLConfig, DebertaConfig, DistilBertConfig, @@ -134,6 +136,7 @@ if is_tokenizers_available(): from ..barthez.tokenization_barthez_fast import BarthezTokenizerFast from ..bert.tokenization_bert_fast import BertTokenizerFast from ..camembert.tokenization_camembert_fast import CamembertTokenizerFast + from ..convbert.tokenization_convbert_fast import ConvBertTokenizerFast from ..distilbert.tokenization_distilbert_fast import DistilBertTokenizerFast from ..dpr.tokenization_dpr_fast import DPRQuestionEncoderTokenizerFast from ..electra.tokenization_electra_fast import ElectraTokenizerFast @@ -163,6 +166,7 @@ else: BarthezTokenizerFast = None BertTokenizerFast = None CamembertTokenizerFast = None + ConvBertTokenizerFast = None DistilBertTokenizerFast = None DPRQuestionEncoderTokenizerFast = None ElectraTokenizerFast = None @@ -233,6 +237,7 @@ TOKENIZER_MAPPING = OrderedDict( (MPNetConfig, (MPNetTokenizer, MPNetTokenizerFast)), (TapasConfig, (TapasTokenizer, None)), (LEDConfig, (LEDTokenizer, LEDTokenizerFast)), + (ConvBertConfig, (ConvBertTokenizer, ConvBertTokenizerFast)), ] ) diff --git a/src/transformers/models/convbert/__init__.py b/src/transformers/models/convbert/__init__.py new file mode 100644 index 0000000000..3fc591b361 --- /dev/null +++ b/src/transformers/models/convbert/__init__.py @@ -0,0 +1,111 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...file_utils import _BaseLazyModule, is_tf_available, is_tokenizers_available, is_torch_available + + +_import_structure = { + "configuration_convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig"], + "tokenization_convbert": ["ConvBertTokenizer"], +} + +if is_tokenizers_available(): + _import_structure["tokenization_convbert_fast"] = ["ConvBertTokenizerFast"] + +if is_torch_available(): + _import_structure["modeling_convbert"] = [ + "CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "ConvBertForMaskedLM", + "ConvBertForMultipleChoice", + "ConvBertForQuestionAnswering", + "ConvBertForSequenceClassification", + "ConvBertForTokenClassification", + "ConvBertLayer", + "ConvBertModel", + "ConvBertPreTrainedModel", + "load_tf_weights_in_convbert", + ] + + +if is_tf_available(): + _import_structure["modeling_tf_convbert"] = [ + "TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFConvBertForMaskedLM", + "TFConvBertForMultipleChoice", + "TFConvBertForQuestionAnswering", + "TFConvBertForSequenceClassification", + "TFConvBertForTokenClassification", + "TFConvBertLayer", + "TFConvBertModel", + "TFConvBertPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig + from .tokenization_convbert import ConvBertTokenizer + + if is_tokenizers_available(): + from .tokenization_convbert_fast import ConvBertTokenizerFast + + if is_torch_available(): + from .modeling_convbert import ( + CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + ConvBertForMaskedLM, + ConvBertForMultipleChoice, + ConvBertForQuestionAnswering, + ConvBertForSequenceClassification, + ConvBertForTokenClassification, + ConvBertLayer, + ConvBertModel, + ConvBertPreTrainedModel, + load_tf_weights_in_convbert, + ) + + if is_tf_available(): + from .modeling_tf_convbert import ( + TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFConvBertForMaskedLM, + TFConvBertForMultipleChoice, + TFConvBertForQuestionAnswering, + TFConvBertForSequenceClassification, + TFConvBertForTokenClassification, + TFConvBertLayer, + TFConvBertModel, + TFConvBertPreTrainedModel, + ) + + +else: + import importlib + import os + import sys + + class _LazyModule(_BaseLazyModule): + """ + Module class that surfaces all objects but only performs associated imports when the objects are requested. + """ + + __file__ = globals()["__file__"] + __path__ = [os.path.dirname(__file__)] + + def _get_module(self, module_name: str): + return importlib.import_module("." + module_name, self.__name__) + + sys.modules[__name__] = _LazyModule(__name__, _import_structure) diff --git a/src/transformers/models/convbert/configuration_convbert.py b/src/transformers/models/convbert/configuration_convbert.py new file mode 100644 index 0000000000..ef4df0ee56 --- /dev/null +++ b/src/transformers/models/convbert/configuration_convbert.py @@ -0,0 +1,136 @@ +# coding=utf-8 +# Copyright The HuggingFace team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ConvBERT model configuration """ + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "YituTech/conv-bert-base": "https://huggingface.co/YituTech/conv-bert-base/resolve/main/config.json", + "YituTech/conv-bert-medium-small": "https://huggingface.co/YituTech/conv-bert-medium-small/resolve/main/config.json", + "YituTech/conv-bert-small": "https://huggingface.co/YituTech/conv-bert-small/resolve/main/config.json", + # See all ConvBERT models at https://huggingface.co/models?filter=convbert +} + + +class ConvBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.ConvBertModel`. It is used to + instantiate an ConvBERT model according to the specified arguments, defining the model architecture. Instantiating + a configuration with the defaults will yield a similar configuration to that of the ConvBERT `conv-bert-base + `__ architecture. Configuration objects inherit from + :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from + :class:`~transformers.PretrainedConfig` for more information. + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 30522): + Vocabulary size of the ConvBERT model. Defines the number of different tokens that can be represented by + the :obj:`inputs_ids` passed when calling :class:`~transformers.ConvBertModel` or + :class:`~transformers.TFConvBertModel`. + hidden_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, `optional`, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported. + hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 2): + The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.ConvBertModel` + or :class:`~transformers.TFConvBertModel`. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers. + head_ratio (:obj:`int`, `optional`, defaults to 2): + Ratio gamma to reduce the number of attention heads. + num_groups (:obj:`int`, `optional`, defaults to 1): + The number of groups for grouped linear layers for ConvBert model + conv_kernel_size (:obj:`int`, `optional`, defaults to 9): + The size of the convolutional kernel. + + + Example:: + >>> from transformers import ConvBertModel, ConvBertConfig + >>> # Initializing a ConvBERT convbert-base-uncased style configuration + >>> configuration = ConvBertConfig() + >>> # Initializing a model from the convbert-base-uncased style configuration + >>> model = ConvBertModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "convbert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + is_encoder_decoder=False, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + embedding_size=768, + head_ratio=2, + conv_kernel_size=9, + num_groups=1, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + is_encoder_decoder=is_encoder_decoder, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.embedding_size = embedding_size + self.head_ratio = head_ratio + self.conv_kernel_size = conv_kernel_size + self.num_groups = num_groups diff --git a/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch.py b/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch.py new file mode 100644 index 0000000000..7bb8a6ce51 --- /dev/null +++ b/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch.py @@ -0,0 +1,52 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert ConvBERT checkpoint.""" + +import argparse + +from ...utils import logging +from .modeling_convbert import ConvBertConfig, ConvBertModel, load_tf_weights_in_convbert + + +logging.set_verbosity_info() + + +def convert_orig_tf1_checkpoint_to_pytorch(tf_checkpoint_path, convbert_config_file, pytorch_dump_path): + conf = ConvBertConfig.from_json_file(convbert_config_file) + model = ConvBertModel(conf) + + model = load_tf_weights_in_convbert(model, conf, tf_checkpoint_path) + model.save_pretrained(pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--convbert_config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained ConvBERT model. \n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + args = parser.parse_args() + convert_orig_tf1_checkpoint_to_pytorch(args.tf_checkpoint_path, args.conv_bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py new file mode 100755 index 0000000000..ccc352361c --- /dev/null +++ b/src/transformers/models/convbert/modeling_convbert.py @@ -0,0 +1,1307 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch ConvBERT model. """ + + +import math +import os +from operator import attrgetter + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN, get_activation +from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward +from ...modeling_outputs import ( + BaseModelOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import ( + PreTrainedModel, + SequenceSummary, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from ...utils import logging +from .configuration_convbert import ConvBertConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "ConvBertConfig" +_TOKENIZER_FOR_DOC = "ConvBertTokenizer" + +CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "YituTech/conv-bert-base", + "YituTech/conv-bert-medium-small", + "YituTech/conv-bert-small", + # See all ConvBERT models at https://huggingface.co/models?filter=convbert +] + + +def load_tf_weights_in_convbert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + tf_data = {} + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + tf_data[name] = array + + param_mapping = { + "embeddings.word_embeddings.weight": "electra/embeddings/word_embeddings", + "embeddings.position_embeddings.weight": "electra/embeddings/position_embeddings", + "embeddings.token_type_embeddings.weight": "electra/embeddings/token_type_embeddings", + "embeddings.LayerNorm.weight": "electra/embeddings/LayerNorm/gamma", + "embeddings.LayerNorm.bias": "electra/embeddings/LayerNorm/beta", + "embeddings_project.weight": "electra/embeddings_project/kernel", + "embeddings_project.bias": "electra/embeddings_project/bias", + } + if config.num_groups > 1: + group_dense_name = "g_dense" + else: + group_dense_name = "dense" + + for j in range(config.num_hidden_layers): + param_mapping[ + f"encoder.layer.{j}.attention.self.query.weight" + ] = f"electra/encoder/layer_{j}/attention/self/query/kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.query.bias" + ] = f"electra/encoder/layer_{j}/attention/self/query/bias" + param_mapping[ + f"encoder.layer.{j}.attention.self.key.weight" + ] = f"electra/encoder/layer_{j}/attention/self/key/kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.key.bias" + ] = f"electra/encoder/layer_{j}/attention/self/key/bias" + param_mapping[ + f"encoder.layer.{j}.attention.self.value.weight" + ] = f"electra/encoder/layer_{j}/attention/self/value/kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.value.bias" + ] = f"electra/encoder/layer_{j}/attention/self/value/bias" + param_mapping[ + f"encoder.layer.{j}.attention.self.key_conv_attn_layer.depthwise.weight" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_key/depthwise_kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.key_conv_attn_layer.pointwise.weight" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_key/pointwise_kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.key_conv_attn_layer.bias" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_key/bias" + param_mapping[ + f"encoder.layer.{j}.attention.self.conv_kernel_layer.weight" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.conv_kernel_layer.bias" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/bias" + param_mapping[ + f"encoder.layer.{j}.attention.self.conv_out_layer.weight" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_point/kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.conv_out_layer.bias" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_point/bias" + param_mapping[ + f"encoder.layer.{j}.attention.output.dense.weight" + ] = f"electra/encoder/layer_{j}/attention/output/dense/kernel" + param_mapping[ + f"encoder.layer.{j}.attention.output.LayerNorm.weight" + ] = f"electra/encoder/layer_{j}/attention/output/LayerNorm/gamma" + param_mapping[ + f"encoder.layer.{j}.attention.output.dense.bias" + ] = f"electra/encoder/layer_{j}/attention/output/dense/bias" + param_mapping[ + f"encoder.layer.{j}.attention.output.LayerNorm.bias" + ] = f"electra/encoder/layer_{j}/attention/output/LayerNorm/beta" + param_mapping[ + f"encoder.layer.{j}.intermediate.dense.weight" + ] = f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/kernel" + param_mapping[ + f"encoder.layer.{j}.intermediate.dense.bias" + ] = f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/bias" + param_mapping[ + f"encoder.layer.{j}.output.dense.weight" + ] = f"electra/encoder/layer_{j}/output/{group_dense_name}/kernel" + param_mapping[ + f"encoder.layer.{j}.output.dense.bias" + ] = f"electra/encoder/layer_{j}/output/{group_dense_name}/bias" + param_mapping[ + f"encoder.layer.{j}.output.LayerNorm.weight" + ] = f"electra/encoder/layer_{j}/output/LayerNorm/gamma" + param_mapping[f"encoder.layer.{j}.output.LayerNorm.bias"] = f"electra/encoder/layer_{j}/output/LayerNorm/beta" + + for param in model.named_parameters(): + param_name = param[0] + retriever = attrgetter(param_name) + result = retriever(model) + tf_name = param_mapping[param_name] + value = torch.from_numpy(tf_data[tf_name]) + logger.info(f"TF: {tf_name}, PT: {param_name} ") + if tf_name.endswith("/kernel"): + if not tf_name.endswith("/intermediate/g_dense/kernel"): + if not tf_name.endswith("/output/g_dense/kernel"): + value = value.T + if tf_name.endswith("/depthwise_kernel"): + value = value.permute(1, 2, 0) # 2, 0, 1 + if tf_name.endswith("/pointwise_kernel"): + value = value.permute(2, 1, 0) # 2, 1, 0 + if tf_name.endswith("/conv_attn_key/bias"): + value = value.unsqueeze(-1) + result.data = value + return model + + +class ConvBertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class ConvBertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ConvBertConfig + load_tf_weights = load_tf_weights_in_convbert + base_model_prefix = "convbert" + authorized_missing_keys = [r"position_ids"] + authorized_unexpected_keys = [r"convbert\.embeddings_project\.weight", r"convbert\.embeddings_project\.bias"] + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, (nn.Linear)) and module.bias is not None: + module.bias.data.zero_() + + +class SeparableConv1D(nn.Module): + """This class implements separable convolution, i.e. a depthwise and a pointwise layer""" + + def __init__(self, config, input_filters, output_filters, kernel_size, **kwargs): + super().__init__() + self.depthwise = nn.Conv1d( + input_filters, + input_filters, + kernel_size=kernel_size, + groups=input_filters, + padding=kernel_size // 2, + bias=False, + ) + self.pointwise = nn.Conv1d(input_filters, output_filters, kernel_size=1, bias=False) + self.bias = nn.Parameter(torch.zeros(output_filters, 1)) + + self.depthwise.weight.data.normal_(mean=0.0, std=config.initializer_range) + self.pointwise.weight.data.normal_(mean=0.0, std=config.initializer_range) + + def forward(self, hidden_states): + x = self.depthwise(hidden_states) + x = self.pointwise(x) + x += self.bias + return x + + +class ConvBertSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + new_num_attention_heads = config.num_attention_heads // config.head_ratio + if new_num_attention_heads < 1: + self.head_ratio = config.num_attention_heads + self.num_attention_heads = 1 + else: + self.num_attention_heads = new_num_attention_heads + self.head_ratio = config.head_ratio + + self.conv_kernel_size = config.conv_kernel_size + assert ( + config.hidden_size % self.num_attention_heads == 0 + ), "hidden_size should be divisible by num_attention_heads" + + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.key_conv_attn_layer = SeparableConv1D( + config, config.hidden_size, self.all_head_size, self.conv_kernel_size + ) + self.conv_kernel_layer = nn.Linear(self.all_head_size, self.num_attention_heads * self.conv_kernel_size) + self.conv_out_layer = nn.Linear(config.hidden_size, self.all_head_size) + + self.unfold = nn.Unfold( + kernel_size=[self.conv_kernel_size, 1], padding=[int((self.conv_kernel_size - 1) / 2), 0] + ) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + batch_size = hidden_states.size(0) + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + if encoder_hidden_states is not None: + mixed_key_layer = self.key(encoder_hidden_states) + mixed_value_layer = self.value(encoder_hidden_states) + else: + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + mixed_key_conv_attn_layer = self.key_conv_attn_layer(hidden_states.transpose(1, 2)) + mixed_key_conv_attn_layer = mixed_key_conv_attn_layer.transpose(1, 2) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + conv_attn_layer = torch.multiply(mixed_key_conv_attn_layer, mixed_query_layer) + + conv_kernel_layer = self.conv_kernel_layer(conv_attn_layer) + conv_kernel_layer = torch.reshape(conv_kernel_layer, [-1, self.conv_kernel_size, 1]) + conv_kernel_layer = torch.softmax(conv_kernel_layer, dim=1) + + conv_out_layer = self.conv_out_layer(hidden_states) + conv_out_layer = torch.reshape(conv_out_layer, [batch_size, -1, self.all_head_size]) + conv_out_layer = conv_out_layer.transpose(1, 2).contiguous().unsqueeze(-1) + conv_out_layer = nn.functional.unfold( + conv_out_layer, + kernel_size=[self.conv_kernel_size, 1], + dilation=1, + padding=[(self.conv_kernel_size - 1) // 2, 0], + stride=1, + ) + conv_out_layer = conv_out_layer.transpose(1, 2).reshape( + batch_size, -1, self.all_head_size, self.conv_kernel_size + ) + conv_out_layer = torch.reshape(conv_out_layer, [-1, self.attention_head_size, self.conv_kernel_size]) + conv_out_layer = torch.matmul(conv_out_layer, conv_kernel_layer) + conv_out_layer = torch.reshape(conv_out_layer, [-1, self.all_head_size]) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in ConvBertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = torch.nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + + conv_out = torch.reshape(conv_out_layer, [batch_size, -1, self.num_attention_heads, self.attention_head_size]) + context_layer = torch.cat([context_layer, conv_out], 2) + + new_context_layer_shape = context_layer.size()[:-2] + (self.head_ratio * self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + + +class ConvBertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class ConvBertAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = ConvBertSelfAttention(config) + self.output = ConvBertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class GroupedLinearLayer(nn.Module): + def __init__(self, input_size, output_size, num_groups): + super().__init__() + self.input_size = input_size + self.output_size = output_size + self.num_groups = num_groups + self.group_in_dim = self.input_size // self.num_groups + self.group_out_dim = self.output_size // self.num_groups + self.weight = nn.Parameter(torch.Tensor(self.num_groups, self.group_in_dim, self.group_out_dim)) + self.bias = nn.Parameter(torch.Tensor(output_size)) + + def forward(self, hidden_states): + batch_size = list(hidden_states.size())[0] + x = torch.reshape(hidden_states, [-1, self.num_groups, self.group_in_dim]) + x = x.permute(1, 0, 2) + x = torch.matmul(x, self.weight) + x = x.permute(1, 0, 2) + x = torch.reshape(x, [batch_size, -1, self.output_size]) + x = x + self.bias + return x + + +class ConvBertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + if config.num_groups == 1: + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + else: + self.dense = GroupedLinearLayer( + input_size=config.hidden_size, output_size=config.intermediate_size, num_groups=config.num_groups + ) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class ConvBertOutput(nn.Module): + def __init__(self, config): + super().__init__() + if config.num_groups == 1: + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + else: + self.dense = GroupedLinearLayer( + input_size=config.intermediate_size, output_size=config.hidden_size, num_groups=config.num_groups + ) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class ConvBertLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ConvBertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + self.crossattention = ConvBertAttention(config) + self.intermediate = ConvBertIntermediate(config) + self.output = ConvBertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + ): + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, "crossattention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + cross_attention_outputs = self.crossattention( + attention_output, + encoder_attention_mask, + head_mask, + encoder_hidden_states, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class ConvBertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([ConvBertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if getattr(self.config, "gradient_checkpointing", False): + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class ConvBertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +CONVBERT_START_DOCSTRING = r""" + This model is a PyTorch `torch.nn.Module `_ sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config (:class:`~transformers.ConvBertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +CONVBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`transformers.ConvBertTokenizer`. See + :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top.", + CONVBERT_START_DOCSTRING, +) +class ConvBertModel(ConvBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.embeddings = ConvBertEmbeddings(config) + + if config.embedding_size != config.hidden_size: + self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size) + + self.encoder = ConvBertEncoder(config) + self.config = config + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="YituTech/conv-bert-base", + output_type=BaseModelOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + hidden_states = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + + if hasattr(self, "embeddings_project"): + hidden_states = self.embeddings_project(hidden_states) + + hidden_states = self.encoder( + hidden_states, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return hidden_states + + +class ConvBertGeneratorPredictions(nn.Module): + """Prediction module for the generator, made up of two dense layers.""" + + def __init__(self, config): + super().__init__() + + self.LayerNorm = nn.LayerNorm(config.embedding_size) + self.dense = nn.Linear(config.hidden_size, config.embedding_size) + + def forward(self, generator_hidden_states): + hidden_states = self.dense(generator_hidden_states) + hidden_states = get_activation("gelu")(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + +@add_start_docstrings("""ConvBERT Model with a `language modeling` head on top. """, CONVBERT_START_DOCSTRING) +class ConvBertForMaskedLM(ConvBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.convbert = ConvBertModel(config) + self.generator_predictions = ConvBertGeneratorPredictions(config) + + self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) + self.init_weights() + + def get_output_embeddings(self): + return self.generator_lm_head + + def set_output_embeddings(self, word_embeddings): + self.generator_lm_head = word_embeddings + + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="YituTech/conv-bert-base", + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + generator_hidden_states = self.convbert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + inputs_embeds, + output_attentions, + output_hidden_states, + return_dict, + ) + generator_sequence_output = generator_hidden_states[0] + + prediction_scores = self.generator_predictions(generator_sequence_output) + prediction_scores = self.generator_lm_head(prediction_scores) + + loss = None + # Masked language modeling softmax layer + if labels is not None: + loss_fct = nn.CrossEntropyLoss() # -100 index = padding token + loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + generator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return MaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=generator_hidden_states.hidden_states, + attentions=generator_hidden_states.attentions, + ) + + +class ConvBertClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + self.config = config + + def forward(self, hidden_states, **kwargs): + x = hidden_states[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = ACT2FN[self.config.hidden_act](x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@add_start_docstrings( + """ + ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + CONVBERT_START_DOCSTRING, +) +class ConvBertForSequenceClassification(ConvBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.convbert = ConvBertModel(config) + self.classifier = ConvBertClassificationHead(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="YituTech/conv-bert-base", + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.convbert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + ConvBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + CONVBERT_START_DOCSTRING, +) +class ConvBertForMultipleChoice(ConvBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.convbert = ConvBertModel(config) + self.sequence_summary = SequenceSummary(config) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + @add_start_docstrings_to_model_forward( + CONVBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="YituTech/conv-bert-base", + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See + :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.convbert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + pooled_output = self.sequence_summary(sequence_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + CONVBERT_START_DOCSTRING, +) +class ConvBertForTokenClassification(ConvBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.convbert = ConvBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="YituTech/conv-bert-base", + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.convbert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + CONVBERT_START_DOCSTRING, +) +class ConvBertForQuestionAnswering(ConvBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.num_labels = config.num_labels + self.convbert = ConvBertModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="YituTech/conv-bert-base", + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.convbert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py new file mode 100644 index 0000000000..650c13c129 --- /dev/null +++ b/src/transformers/models/convbert/modeling_tf_convbert.py @@ -0,0 +1,1514 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 ConvBERT model. """ + + +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...file_utils import ( + MULTIPLE_CHOICE_DUMMY_INPUTS, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFSequenceSummary, + TFTokenClassificationLoss, + get_initializer, + input_processing, + keras_serializable, + shape_list, +) +from ...utils import logging +from .configuration_convbert import ConvBertConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "ConvBertConfig" +_TOKENIZER_FOR_DOC = "ConvBertTokenizer" + +TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "YituTech/conv-bert-base", + "YituTech/conv-bert-medium-small", + "YituTech/conv-bert-small", + # See all ConvBERT models at https://huggingface.co/models?filter=convbert +] + + +class TFConvBertWordEmbeddings(tf.keras.layers.Layer): + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids): + flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) + embeddings = tf.gather(params=self.weight, indices=flat_input_ids) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +class TFConvBertTokenTypeEmbeddings(tf.keras.layers.Layer): + def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.type_vocab_size = type_vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape=input_shape) + + def get_config(self): + config = { + "type_vocab_size": self.type_vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, token_type_ids): + flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) + one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) + embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) + embeddings = tf.reshape( + tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + ) + + embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + + return embeddings + + +class TFConvBertPositionEmbeddings(tf.keras.layers.Layer): + def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_range: float, **kwargs): + super().__init__(**kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.initializer_range = initializer_range + + def build(self, input_shape): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + super().build(input_shape) + + def get_config(self): + config = { + "max_position_embeddings": self.max_position_embeddings, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, position_ids): + input_shape = shape_list(tensor=position_ids) + position_embeddings = self.position_embeddings[: input_shape[1], :] + + return tf.broadcast_to(input=position_embeddings, shape=input_shape) + + +class TFConvBertEmbeddings(tf.keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.word_embeddings = TFConvBertWordEmbeddings( + vocab_size=config.vocab_size, + hidden_size=config.embedding_size, + initializer_range=config.initializer_range, + name="word_embeddings", + ) + self.position_embeddings = TFConvBertPositionEmbeddings( + max_position_embeddings=config.max_position_embeddings, + hidden_size=config.embedding_size, + initializer_range=config.initializer_range, + name="position_embeddings", + ) + self.token_type_embeddings = TFConvBertTokenTypeEmbeddings( + type_vocab_size=config.type_vocab_size, + hidden_size=config.embedding_size, + initializer_range=config.initializer_range, + name="token_type_embeddings", + ) + self.embeddings_sum = tf.keras.layers.Add() + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. + """ + assert not (input_ids is None and inputs_embeds is None) + + if input_ids is not None: + inputs_embeds = self.word_embeddings(input_ids=input_ids) + + if token_type_ids is None: + input_shape = shape_list(tensor=inputs_embeds)[:-1] + token_type_ids = tf.fill(dims=input_shape, value=0) + + if position_ids is None: + position_embeds = self.position_embeddings(position_ids=inputs_embeds) + else: + position_embeds = self.position_embeddings(position_ids=position_ids) + + token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +class TFConvBertSelfAttention(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + new_num_attention_heads = int(config.num_attention_heads / config.head_ratio) + if new_num_attention_heads < 1: + self.head_ratio = config.num_attention_heads + num_attention_heads = 1 + else: + num_attention_heads = new_num_attention_heads + self.head_ratio = config.head_ratio + + self.num_attention_heads = num_attention_heads + self.conv_kernel_size = config.conv_kernel_size + + assert ( + config.hidden_size % self.num_attention_heads == 0 + ), "hidden_size should be divisible by num_attention_heads" + + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.query = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + + self.key_conv_attn_layer = tf.keras.layers.SeparableConv1D( + self.all_head_size, + self.conv_kernel_size, + padding="same", + activation=None, + depthwise_initializer=get_initializer(1 / self.conv_kernel_size), + pointwise_initializer=get_initializer(config.initializer_range), + name="key_conv_attn_layer", + ) + + self.conv_kernel_layer = tf.keras.layers.Dense( + self.num_attention_heads * self.conv_kernel_size, + activation=None, + name="conv_kernel_layer", + kernel_initializer=get_initializer(config.initializer_range), + ) + + self.conv_out_layer = tf.keras.layers.Dense( + self.all_head_size, + activation=None, + name="conv_out_layer", + kernel_initializer=get_initializer(config.initializer_range), + ) + + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x, batch_size): + x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + mixed_key_conv_attn_layer = self.key_conv_attn_layer(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + conv_attn_layer = tf.multiply(mixed_key_conv_attn_layer, mixed_query_layer) + + conv_kernel_layer = self.conv_kernel_layer(conv_attn_layer) + conv_kernel_layer = tf.reshape(conv_kernel_layer, [-1, self.conv_kernel_size, 1]) + conv_kernel_layer = tf.nn.softmax(conv_kernel_layer, axis=1) + + conv_out_layer = self.conv_out_layer(hidden_states) + conv_out_layer = tf.reshape(conv_out_layer, [batch_size, -1, self.all_head_size]) + + conv_out_layer = tf.reshape( + conv_out_layer, [batch_size, shape_list(mixed_query_layer)[1], self.all_head_size, 1] + ) + unfold_conv_out_layer = tf.image.extract_patches( + images=conv_out_layer, + sizes=[1, self.conv_kernel_size, 1, 1], + strides=[1, 1, 1, 1], + rates=[1, 1, 1, 1], + padding="SAME", + ) + + conv_out_layer = tf.reshape(unfold_conv_out_layer, [-1, self.attention_head_size, self.conv_kernel_size]) + + conv_out_layer = tf.matmul(conv_out_layer, conv_kernel_layer) + conv_out_layer = tf.reshape(conv_out_layer, [-1, self.all_head_size]) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = tf.matmul( + query_layer, key_layer, transpose_b=True + ) # (batch size, num_heads, seq_len_q, seq_len_k) + dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype) # scale attention_scores + attention_scores = attention_scores / tf.math.sqrt(dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = tf.nn.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + value_layer = tf.reshape( + mixed_value_layer, [batch_size, -1, self.num_attention_heads, self.attention_head_size] + ) + value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) + + context_layer = tf.matmul(attention_probs, value_layer) + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + + conv_out = tf.reshape(conv_out_layer, [batch_size, -1, self.num_attention_heads, self.attention_head_size]) + context_layer = tf.concat([context_layer, conv_out], 2) + context_layer = tf.reshape( + context_layer, (batch_size, -1, self.head_ratio * self.all_head_size) + ) # (batch_size, seq_len_q, all_head_size) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class TFConvBertSelfOutput(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, hidden_states, input_tensor, training=False): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + + return hidden_states + + +class TFConvBertAttention(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TFConvBertSelfAttention(config, name="self") + self.dense_output = TFConvBertSelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False): + self_outputs = self.self_attention( + input_tensor, attention_mask, head_mask, output_attentions, training=training + ) + attention_output = self.dense_output(self_outputs[0], input_tensor, training=training) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + + return outputs + + +class GroupedLinearLayer(tf.keras.layers.Layer): + def __init__(self, input_size, output_size, num_groups, kernel_initializer, **kwargs): + super().__init__(**kwargs) + self.input_size = input_size + self.output_size = output_size + self.num_groups = num_groups + self.kernel_initializer = kernel_initializer + self.group_in_dim = self.input_size // self.num_groups + self.group_out_dim = self.output_size // self.num_groups + + def build(self, input_shape): + self.kernel = self.add_weight( + "kernel", + shape=[self.num_groups, self.group_in_dim, self.group_out_dim], + initializer=self.kernel_initializer, + trainable=True, + ) + + self.bias = self.add_weight( + "bias", shape=[self.output_size], initializer=self.kernel_initializer, dtype=self.dtype, trainable=True + ) + + def call(self, hidden_states): + batch_size = shape_list(tensor=hidden_states)[1] + x = tf.reshape(hidden_states, [-1, self.num_groups, self.group_in_dim]) + x = tf.matmul(a=x, b=self.kernel, transpose_b=True) + x = tf.reshape(x, [batch_size, -1, self.output_size]) + x = tf.nn.bias_add(value=x, bias=self.bias) + return x + + +class TFConvBertIntermediate(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + if config.num_groups == 1: + self.dense = tf.keras.layers.Dense( + config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + else: + self.dense = GroupedLinearLayer( + config.hidden_size, + config.intermediate_size, + num_groups=config.num_groups, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +class TFConvBertOutput(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + if config.num_groups == 1: + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + else: + self.dense = GroupedLinearLayer( + config.intermediate_size, + config.hidden_size, + num_groups=config.num_groups, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, hidden_states, input_tensor, training=False): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + + return hidden_states + + +class TFConvBertLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.attention = TFConvBertAttention(config, name="attention") + self.intermediate = TFConvBertIntermediate(config, name="intermediate") + self.bert_output = TFConvBertOutput(config, name="output") + + def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + attention_outputs = self.attention( + hidden_states, attention_mask, head_mask, output_attentions, training=training + ) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.bert_output(intermediate_output, attention_output, training=training) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + + return outputs + + +class TFConvBertEncoder(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.layer = [TFConvBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states, + attention_mask, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + training=False, + ): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states, attention_mask, head_mask[i], output_attentions, training=training + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = get_tf_activation(config.hidden_act) + else: + self.transform_act_fn = config.hidden_act + + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + +@keras_serializable +class TFConvBertMainLayer(tf.keras.layers.Layer): + config_class = ConvBertConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.embeddings = TFConvBertEmbeddings(config, name="embeddings") + + if config.embedding_size != config.hidden_size: + self.embeddings_project = tf.keras.layers.Dense(config.hidden_size, name="embeddings_project") + + self.encoder = TFConvBertEncoder(config, name="encoder") + self.config = config + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings.weight = value + self.embeddings.word_embeddings.vocab_size = value.shape[0] + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + def get_extended_attention_mask(self, attention_mask, input_shape, dtype): + if attention_mask is None: + attention_mask = tf.fill(input_shape, 1) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = tf.cast(extended_attention_mask, dtype) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + return extended_attention_mask + + def get_head_mask(self, head_mask): + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + return head_mask + + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + **kwargs, + ): + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif inputs["input_ids"] is not None: + input_shape = shape_list(inputs["input_ids"]) + elif inputs["inputs_embeds"] is not None: + input_shape = shape_list(inputs["inputs_embeds"])[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs["attention_mask"] is None: + inputs["attention_mask"] = tf.fill(input_shape, 1) + + if inputs["token_type_ids"] is None: + inputs["token_type_ids"] = tf.fill(input_shape, 0) + + hidden_states = self.embeddings( + inputs["input_ids"], + inputs["position_ids"], + inputs["token_type_ids"], + inputs["inputs_embeds"], + training=inputs["training"], + ) + extended_attention_mask = self.get_extended_attention_mask( + inputs["attention_mask"], input_shape, hidden_states.dtype + ) + inputs["head_mask"] = self.get_head_mask(inputs["head_mask"]) + + if hasattr(self, "embeddings_project"): + hidden_states = self.embeddings_project(hidden_states, training=inputs["training"]) + + hidden_states = self.encoder( + hidden_states, + extended_attention_mask, + inputs["head_mask"], + inputs["output_attentions"], + inputs["output_hidden_states"], + inputs["return_dict"], + training=inputs["training"], + ) + + return hidden_states + + +class TFConvBertPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ConvBertConfig + base_model_prefix = "convbert" + + +CONVBERT_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the + generic methods the library implements for all its model (such as downloading or saving, resizing the input + embeddings, pruning heads etc.) + + This model is also a `tf.keras.Model `__ subclass. Use + it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage + and behavior. + + .. note:: + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all + the tensors in the first argument of the model call function: :obj:`model(inputs)`. + + If you choose this second option, there are three possibilities you can use to gather all the input Tensors in + the first positional argument : + + - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Args: + config (:class:`~transformers.ConvBertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +CONVBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.ConvBertTokenizer`. See + :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **maked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`__ + position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`__ + head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + training (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare ConvBERT Model transformer outputing raw hidden-states without any specific head on top.", + CONVBERT_START_DOCSTRING, +) +class TFConvBertModel(TFConvBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.convbert = TFConvBertMainLayer(config, name="convbert") + + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="YituTech/conv-bert-base", + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + **kwargs, + ): + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + outputs = self.convbert( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + head_mask=inputs["head_mask"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + return outputs + + def serving_output(self, output): + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFBaseModelOutput(last_hidden_state=output.last_hidden_state, hidden_states=hs, attentions=attns) + + +class TFConvBertMaskedLMHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = config.vocab_size + self.embedding_size = config.embedding_size + self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + + super().build(input_shape) + + def get_output_embeddings(self): + return self.input_embeddings + + def set_output_embeddings(self, value): + self.input_embeddings.weight = value + self.input_embeddings.vocab_size = shape_list(value)[0] + + def get_bias(self): + return {"bias": self.bias} + + def set_bias(self, value): + self.bias = value["bias"] + self.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states): + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +class TFConvBertGeneratorPredictions(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense") + + def call(self, generator_hidden_states, training=False): + hidden_states = self.dense(generator_hidden_states) + hidden_states = get_tf_activation("gelu")(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + +@add_start_docstrings("""ConvBERT Model with a `language modeling` head on top. """, CONVBERT_START_DOCSTRING) +class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, **kwargs) + + self.vocab_size = config.vocab_size + self.convbert = TFConvBertMainLayer(config, name="convbert") + self.generator_predictions = TFConvBertGeneratorPredictions(config, name="generator_predictions") + + if isinstance(config.hidden_act, str): + self.activation = get_tf_activation(config.hidden_act) + else: + self.activation = config.hidden_act + + self.generator_lm_head = TFConvBertMaskedLMHead( + config, self.convbert.embeddings.word_embeddings, name="generator_lm_head" + ) + + def get_lm_head(self): + return self.generator_lm_head + + def get_prefix_bias_name(self): + return self.name + "/" + self.generator_lm_head.name + + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="YituTech/conv-bert-base", + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + training=False, + **kwargs, + ): + r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + generator_hidden_states = self.convbert( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + head_mask=inputs["head_mask"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + generator_sequence_output = generator_hidden_states[0] + prediction_scores = self.generator_predictions(generator_sequence_output, training=inputs["training"]) + prediction_scores = self.generator_lm_head(prediction_scores, training=inputs["training"]) + loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], prediction_scores) + + if not inputs["return_dict"]: + output = (prediction_scores,) + generator_hidden_states[1:] + + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=generator_hidden_states.hidden_states, + attentions=generator_hidden_states.attentions, + ) + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output + def serving_output(self, output): + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFMaskedLMOutput(logits=output.logits, hidden_states=hs, attentions=attns) + + +class TFConvBertClassificationHead(tf.keras.layers.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.out_proj = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + ) + + self.config = config + + def call(self, hidden_states, **kwargs): + x = hidden_states[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = get_tf_activation(self.config.hidden_act)(x) + x = self.dropout(x) + x = self.out_proj(x) + + return x + + +@add_start_docstrings( + """ + ConvBERT Model transformer with a sequence classification/regression head on top e.g., for GLUE tasks. + """, + CONVBERT_START_DOCSTRING, +) +class TFConvBertForSequenceClassification(TFConvBertPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + self.convbert = TFConvBertMainLayer(config, name="convbert") + self.classifier = TFConvBertClassificationHead(config, name="classifier") + + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="YituTech/conv-bert-base", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + training=False, + **kwargs, + ): + r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + outputs = self.convbert( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + head_mask=inputs["head_mask"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + logits = self.classifier(outputs[0], training=inputs["training"]) + loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], logits) + + if not inputs["return_dict"]: + output = (logits,) + outputs[1:] + + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def serving_output(self, output): + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFSequenceClassifierOutput(logits=output.logits, hidden_states=hs, attentions=attns) + + +@add_start_docstrings( + """ + ConvBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + CONVBERT_START_DOCSTRING, +) +class TFConvBertForMultipleChoice(TFConvBertPreTrainedModel, TFMultipleChoiceLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.convbert = TFConvBertMainLayer(config, name="convbert") + self.sequence_summary = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="sequence_summary" + ) + self.classifier = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @property + def dummy_inputs(self): + """ + Dummy inputs to build the network. + + Returns: + tf.Tensor with dummy inputs + """ + return {"input_ids": tf.convert_to_tensor(MULTIPLE_CHOICE_DUMMY_INPUTS)} + + @add_start_docstrings_to_model_forward( + CONVBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="YituTech/conv-bert-base", + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + training=False, + **kwargs, + ): + r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See + :obj:`input_ids` above) + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + + if inputs["input_ids"] is not None: + num_choices = shape_list(inputs["input_ids"])[1] + seq_length = shape_list(inputs["input_ids"])[2] + else: + num_choices = shape_list(inputs["inputs_embeds"])[1] + seq_length = shape_list(inputs["inputs_embeds"])[2] + + flat_input_ids = tf.reshape(inputs["input_ids"], (-1, seq_length)) if inputs["input_ids"] is not None else None + flat_attention_mask = ( + tf.reshape(inputs["attention_mask"], (-1, seq_length)) if inputs["attention_mask"] is not None else None + ) + flat_token_type_ids = ( + tf.reshape(inputs["token_type_ids"], (-1, seq_length)) if inputs["token_type_ids"] is not None else None + ) + flat_position_ids = ( + tf.reshape(inputs["position_ids"], (-1, seq_length)) if inputs["position_ids"] is not None else None + ) + flat_inputs_embeds = ( + tf.reshape(inputs["inputs_embeds"], (-1, seq_length, shape_list(inputs["inputs_embeds"])[3])) + if inputs["inputs_embeds"] is not None + else None + ) + outputs = self.convbert( + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + inputs["head_mask"], + flat_inputs_embeds, + inputs["output_attentions"], + inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + logits = self.sequence_summary(outputs[0], training=inputs["training"]) + logits = self.classifier(logits) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], reshaped_logits) + + if not inputs["return_dict"]: + output = (reshaped_logits,) + outputs[1:] + + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @tf.function( + input_signature=[ + { + "input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"), + "token_type_ids": tf.TensorSpec((None, None, None), tf.int32, name="token_type_ids"), + } + ] + ) + def serving(self, inputs): + output = self.call(inputs) + + return self.serving_output(output) + + def serving_output(self, output): + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFMultipleChoiceModelOutput(logits=output.logits, hidden_states=hs, attentions=attns) + + +@add_start_docstrings( + """ + ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + CONVBERT_START_DOCSTRING, +) +class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.convbert = TFConvBertMainLayer(config, name="convbert") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="YituTech/conv-bert-base", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + training=False, + **kwargs, + ): + r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + outputs = self.convbert( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + head_mask=inputs["head_mask"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output, training=inputs["training"]) + logits = self.classifier(sequence_output) + loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], logits) + + if not inputs["return_dict"]: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def serving_output(self, output): + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFTokenClassifierOutput(logits=output.logits, hidden_states=hs, attentions=attns) + + +@add_start_docstrings( + """ + ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + CONVBERT_START_DOCSTRING, +) +class TFConvBertForQuestionAnswering(TFConvBertPreTrainedModel, TFQuestionAnsweringLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.convbert = TFConvBertMainLayer(config, name="convbert") + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="YituTech/conv-bert-base", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + start_positions=None, + end_positions=None, + training=False, + **kwargs, + ): + r""" + start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + start_positions=start_positions, + end_positions=end_positions, + training=training, + kwargs_call=kwargs, + ) + outputs = self.convbert( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + head_mask=inputs["head_mask"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + sequence_output = outputs[0] + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + loss = None + + if inputs["start_positions"] is not None and inputs["end_positions"] is not None: + labels = {"start_position": inputs["start_positions"]} + labels["end_position"] = inputs["end_positions"] + loss = self.compute_loss(labels, (start_logits, end_logits)) + + if not inputs["return_dict"]: + output = (start_logits, end_logits) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def serving_output(self, output): + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFQuestionAnsweringModelOutput( + start_logits=output.start_logits, end_logits=output.end_logits, hidden_states=hs, attentions=attns + ) diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py new file mode 100644 index 0000000000..12ee66ed28 --- /dev/null +++ b/src/transformers/models/convbert/tokenization_convbert.py @@ -0,0 +1,56 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for ConvBERT.""" +from ...utils import logging +from ..bert.tokenization_bert import BertTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "YituTech/conv-bert-base": "https://huggingface.co/YituTech/conv-bert-base/resolve/main/vocab.txt", + "YituTech/conv-bert-medium-small": "https://huggingface.co/YituTech/conv-bert-medium-small/resolve/main/vocab.txt", + "YituTech/conv-bert-small": "https://huggingface.co/YituTech/conv-bert-small/resolve/main/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "YituTech/conv-bert-base": 512, + "YituTech/conv-bert-medium-small": 512, + "YituTech/conv-bert-small": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "YituTech/conv-bert-base": {"do_lower_case": True}, + "YituTech/conv-bert-medium-small": {"do_lower_case": True}, + "YituTech/conv-bert-small": {"do_lower_case": True}, +} + + +class ConvBertTokenizer(BertTokenizer): + r""" + Construct a ConvBERT tokenizer. :class:`~transformers.ConvBertTokenizer` is identical to + :class:`~transformers.BertTokenizer` and runs end-to-end tokenization: punctuation splitting and wordpiece. Refer + to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION diff --git a/src/transformers/models/convbert/tokenization_convbert_fast.py b/src/transformers/models/convbert/tokenization_convbert_fast.py new file mode 100644 index 0000000000..4bc4c05234 --- /dev/null +++ b/src/transformers/models/convbert/tokenization_convbert_fast.py @@ -0,0 +1,61 @@ +# coding=utf-8 +# Copyright The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for ConvBERT.""" +from ...utils import logging +from ..bert.tokenization_bert_fast import BertTokenizerFast +from .tokenization_convbert import ConvBertTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "YituTech/conv-bert-base": "https://huggingface.co/YituTech/conv-bert-base/resolve/main/vocab.txt", + "YituTech/conv-bert-medium-small": "https://huggingface.co/YituTech/conv-bert-medium-small/resolve/main/vocab.txt", + "YituTech/conv-bert-small": "https://huggingface.co/YituTech/conv-bert-small/resolve/main/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "YituTech/conv-bert-base": 512, + "YituTech/conv-bert-medium-small": 512, + "YituTech/conv-bert-small": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "YituTech/conv-bert-base": {"do_lower_case": True}, + "YituTech/conv-bert-medium-small": {"do_lower_case": True}, + "YituTech/conv-bert-small": {"do_lower_case": True}, +} + + +class ConvBertTokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" ConvBERT tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.ConvBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs + end-to-end tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + slow_tokenizer_class = ConvBertTokenizer diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 69d0024784..2ebf9ab717 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -697,6 +697,81 @@ class CamembertModel: requires_pytorch(self) +CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class ConvBertForMaskedLM: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class ConvBertForMultipleChoice: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class ConvBertForQuestionAnswering: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class ConvBertForSequenceClassification: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class ConvBertForTokenClassification: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class ConvBertLayer: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + +class ConvBertModel: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class ConvBertPreTrainedModel: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +def load_tf_weights_in_convbert(*args, **kwargs): + requires_pytorch(load_tf_weights_in_convbert) + + CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 1413ae2cce..838a9293fd 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -453,6 +453,77 @@ class TFCamembertModel: requires_tf(self) +TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TFConvBertForMaskedLM: + def __init__(self, *args, **kwargs): + requires_tf(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tf(self) + + +class TFConvBertForMultipleChoice: + def __init__(self, *args, **kwargs): + requires_tf(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tf(self) + + +class TFConvBertForQuestionAnswering: + def __init__(self, *args, **kwargs): + requires_tf(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tf(self) + + +class TFConvBertForSequenceClassification: + def __init__(self, *args, **kwargs): + requires_tf(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tf(self) + + +class TFConvBertForTokenClassification: + def __init__(self, *args, **kwargs): + requires_tf(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tf(self) + + +class TFConvBertLayer: + def __init__(self, *args, **kwargs): + requires_tf(self) + + +class TFConvBertModel: + def __init__(self, *args, **kwargs): + requires_tf(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tf(self) + + +class TFConvBertPreTrainedModel: + def __init__(self, *args, **kwargs): + requires_tf(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tf(self) + + TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py index 1fa8a63fc7..543cabc780 100644 --- a/src/transformers/utils/dummy_tokenizers_objects.py +++ b/src/transformers/utils/dummy_tokenizers_objects.py @@ -47,6 +47,15 @@ class CamembertTokenizerFast: requires_tokenizers(self) +class ConvBertTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + class DistilBertTokenizerFast: def __init__(self, *args, **kwargs): requires_tokenizers(self) diff --git a/tests/test_modeling_convbert.py b/tests/test_modeling_convbert.py new file mode 100644 index 0000000000..e561245bd6 --- /dev/null +++ b/tests/test_modeling_convbert.py @@ -0,0 +1,433 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch ConvBERT model. """ + + +import unittest + +from tests.test_modeling_common import floats_tensor +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + ConvBertConfig, + ConvBertForMaskedLM, + ConvBertForMultipleChoice, + ConvBertForQuestionAnswering, + ConvBertForSequenceClassification, + ConvBertForTokenClassification, + ConvBertModel, + ) + from transformers.models.convbert.modeling_convbert import CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST + + +class ConvBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = ConvBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ConvBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ConvBertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ConvBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = ConvBertForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = ConvBertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = ConvBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class ConvBertModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + ConvBertModel, + ConvBertForMaskedLM, + ConvBertForMultipleChoice, + ConvBertForQuestionAnswering, + ConvBertForSequenceClassification, + ConvBertForTokenClassification, + ) + if is_torch_available() + else () + ) + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = ConvBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConvBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ConvBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 5 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Question Answering model returns start_logits and end_logits + if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values(): + correct_outlen += 1 # start_logits and end_logits instead of only 1 output + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length], + ) + + +@require_torch +class ConvBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = ConvBertModel.from_pretrained("YituTech/conv-bert-base") + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + print(output[:, :3, :3]) + + expected_shape = torch.Size((1, 6, 768)) + self.assertEqual(output.shape, expected_shape) + + # TODO Replace values below with what was printed above. + expected_slice = torch.tensor( + [[[-0.0348, -0.4686, -0.3064], [0.2264, -0.2699, -0.7423], [0.1032, -0.4501, -0.5828]]] + ) + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) diff --git a/tests/test_modeling_tf_convbert.py b/tests/test_modeling_tf_convbert.py new file mode 100644 index 0000000000..5f61b757b3 --- /dev/null +++ b/tests/test_modeling_tf_convbert.py @@ -0,0 +1,394 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import tempfile +import unittest + +from transformers import ConvBertConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TFConvBertForMaskedLM, + TFConvBertForMultipleChoice, + TFConvBertForQuestionAnswering, + TFConvBertForSequenceClassification, + TFConvBertForTokenClassification, + TFConvBertModel, + ) + + +class TFConvBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 384 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.embedding_size = 128 + self.head_ratio = 2 + self.conv_kernel_size = 9 + self.num_groups = 1 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = ConvBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + return_dict=True, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFConvBertModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFConvBertForMaskedLM(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFConvBertForSequenceClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFConvBertForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFConvBertForTokenClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFConvBertForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFConvBertModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFConvBertModel, + TFConvBertForMaskedLM, + TFConvBertForQuestionAnswering, + TFConvBertForSequenceClassification, + TFConvBertForTokenClassification, + TFConvBertForMultipleChoice, + ) + if is_tf_available() + else () + ) + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = TFConvBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConvBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_saved_model_with_attentions_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_attentions = True + config.output_hidden_states = False + + if hasattr(config, "use_cache"): + config.use_cache = False + + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + for model_class in self.all_model_classes: + class_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + num_out = len(model(class_inputs_dict)) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=True) + model = tf.keras.models.load_model(os.path.join(tmpdirname, "saved_model", "1")) + outputs = model(class_inputs_dict) + output = outputs["attentions"] + + self.assertEqual(len(outputs), num_out) + self.assertEqual(len(output), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(output[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length], + ) + + @slow + def test_model_from_pretrained(self): + model = TFConvBertModel.from_pretrained("YituTech/conv-bert-base") + self.assertIsNotNone(model) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) + decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + def check_decoder_attentions_output(outputs): + out_len = len(outputs) + self.assertEqual(out_len % 2, 0) + decoder_attentions = outputs.decoder_attentions + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, decoder_seq_length, decoder_key_length], + ) + + def check_encoder_attentions_output(outputs): + attentions = [ + t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions) + ] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["use_cache"] = False + config.output_hidden_states = False + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + out_len = len(outputs) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + if self.is_encoder_decoder: + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_decoder_attentions_output(outputs) + + # Check that output attentions can also be changed via the config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + config.output_hidden_states = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) + self.assertEqual(model.config.output_hidden_states, True) + check_encoder_attentions_output(outputs) + + +@require_tf +class TFConvBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = TFConvBertModel.from_pretrained("YituTech/conv-bert-base") + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + expected_shape = [1, 6, 768] + self.assertEqual(output.shape, expected_shape) + + print(output[:, :3, :3]) + + expected_slice = tf.constant( + [ + [ + [-0.03475493, -0.4686034, -0.30638832], + [0.22637248, -0.26988646, -0.7423424], + [0.10324868, -0.45013508, -0.58280784], + ] + ] + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4) diff --git a/utils/check_repo.py b/utils/check_repo.py index e40500e06b..54f23a5d08 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -52,6 +52,7 @@ IGNORE_NON_TESTED = [ "TFDPRSpanPredictor", # Building part of bigger (tested) model. "TFElectraMainLayer", # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?) "TFRobertaForMultipleChoice", # TODO: fix + "SeparableConv1D", # Building part of bigger (tested) model. ] # Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't @@ -116,6 +117,7 @@ IGNORE_NON_AUTO_CONFIGURED = [ "XLMProphetNetDecoder", "XLMProphetNetEncoder", "XLNetForQuestionAnswering", + "SeparableConv1D", ] # This is to make sure the transformers module imported is the one in the repo.