From 2422cda01b6165f9d67f14a76391520c6def2f18 Mon Sep 17 00:00:00 2001
From: Weizhen <514269477@qq.com>
Date: Mon, 19 Oct 2020 23:36:09 +0800
Subject: [PATCH] ProphetNet (#7157)

* add new model prophetnet

prophetnet modified

modify codes as suggested v1

add prophetnet test files

* still bugs, because of changed output formats of encoder and decoder

* move prophetnet into the latest version

* clean integration tests

* clean tokenizers

* add xlm config to init

* correct typo in init

* further refactoring

* continue refactor

* save parallel

* add decoder_attention_mask

* fix use_cache vs. past_key_values

* fix common tests

* change decoder output logits

* fix xlm tests

* make common tests pass

* change model architecture

* add tokenizer tests

* finalize model structure

* no weight mapping

* correct n-gram stream attention mask as discussed with qweizhen

* remove unused import

* fix index.rst

* fix tests

* delete unnecessary code

* add fast integration test

* rename weights

* final weight remapping

* save intermediate

* Descriptions for Prophetnet Config File

* finish all models

* finish new model outputs

* delete unnecessary files

* refactor encoder layer

* add dummy docs

* code quality

* fix tests

* add model pages to doctree

* further refactor

* more refactor, more tests

* finish code refactor and tests

* remove unnecessary files

* further clean up

* add docstring template

* finish tokenizer doc

* finish prophetnet

* fix copies

* fix typos

* fix tf tests

* fix fp16

* fix tf test 2nd try

* fix code quality

* add test for each model

* merge new tests to branch

* Update model_cards/microsoft/prophetnet-large-uncased-cnndm/README.md

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Update model_cards/microsoft/prophetnet-large-uncased-cnndm/README.md

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Update src/transformers/modeling_prophetnet.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Update utils/check_repo.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* apply sams and sylvains comments

* make style

* remove unnecessary code

* Update README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/configuration_prophetnet.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* implement lysandres comments

* correct docs

* fix isort

* fix tokenizers

* fix copies

Co-authored-by: weizhen <weizhen@mail.ustc.edu.cn>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
---
 README.md                                     |    2 +
 docs/source/index.rst                         |   28 +-
 docs/source/model_doc/encoderdecoder.rst      |    2 +-
 docs/source/model_doc/prophetnet.rst          |   83 +
 docs/source/model_doc/xlmprophetnet.rst       |   63 +
 .../prophetnet-large-uncased-cnndm/README.md  |   29 +
 .../prophetnet-large-uncased/README.md        |   17 +
 .../README.md                                 |   32 +
 .../README.md                                 |   31 +
 .../xprophetnet-large-wiki100-cased/README.md |   19 +
 src/transformers/__init__.py                  |   22 +-
 src/transformers/configuration_auto.py        |    8 +
 src/transformers/configuration_lxmert.py      |    5 +-
 src/transformers/configuration_prophetnet.py  |  167 ++
 .../configuration_xlm_prophetnet.py           |   35 +
 ..._original_pytorch_checkpoint_to_pytorch.py |  162 ++
 src/transformers/modeling_auto.py             |   14 +
 src/transformers/modeling_lxmert.py           |    4 +-
 src/transformers/modeling_prophetnet.py       | 2052 +++++++++++++++++
 src/transformers/modeling_tf_lxmert.py        |    4 +-
 src/transformers/modeling_utils.py            |   11 +-
 src/transformers/modeling_xlm_prophetnet.py   |  159 ++
 src/transformers/tokenization_auto.py         |    6 +
 src/transformers/tokenization_deberta.py      |    4 +-
 src/transformers/tokenization_fsmt.py         |    4 +-
 src/transformers/tokenization_prophetnet.py   |  284 +++
 .../tokenization_xlm_prophetnet.py            |  307 +++
 src/transformers/tokenization_xlm_roberta.py  |    3 +-
 src/transformers/utils/dummy_pt_objects.py    |   81 +
 .../utils/dummy_sentencepiece_objects.py      |    9 +
 tests/test_modeling_common.py                 |   37 +-
 tests/test_modeling_encoder_decoder.py        |  114 +-
 tests/test_modeling_prophetnet.py             | 1075 +++++++++
 tests/test_modeling_t5.py                     |    1 -
 tests/test_modeling_xlm_prophetnet.py         |  141 ++
 tests/test_tokenization_prophetnet.py         |  191 ++
 tests/test_tokenization_xlm_prophetnet.py     |  125 +
 utils/check_repo.py                           |    5 +
 38 files changed, 5288 insertions(+), 48 deletions(-)
 create mode 100644 docs/source/model_doc/prophetnet.rst
 create mode 100644 docs/source/model_doc/xlmprophetnet.rst
 create mode 100644 model_cards/microsoft/prophetnet-large-uncased-cnndm/README.md
 create mode 100644 model_cards/microsoft/prophetnet-large-uncased/README.md
 create mode 100644 model_cards/microsoft/xprophetnet-large-wiki100-cased-xglue-ntg/README.md
 create mode 100644 model_cards/microsoft/xprophetnet-large-wiki100-cased-xglue-qg/README.md
 create mode 100644 model_cards/microsoft/xprophetnet-large-wiki100-cased/README.md
 create mode 100644 src/transformers/configuration_prophetnet.py
 create mode 100644 src/transformers/configuration_xlm_prophetnet.py
 create mode 100644 src/transformers/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
 create mode 100644 src/transformers/modeling_prophetnet.py
 create mode 100644 src/transformers/modeling_xlm_prophetnet.py
 create mode 100644 src/transformers/tokenization_prophetnet.py
 create mode 100644 src/transformers/tokenization_xlm_prophetnet.py
 create mode 100644 tests/test_modeling_prophetnet.py
 create mode 100644 tests/test_modeling_xlm_prophetnet.py
 create mode 100644 tests/test_tokenization_prophetnet.py
 create mode 100644 tests/test_tokenization_xlm_prophetnet.py

diff --git a/README.md b/README.md
index 9f9600b4dc..bc6f1c30a7 100644
--- a/README.md
+++ b/README.md
@@ -182,6 +182,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper  [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[ProphetNet](https://huggingface.co/transformers/master/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 ultilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
@@ -189,6 +190,7 @@ ultilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+1. **[XLM-ProphetNet](https://huggingface.co/transformers/master/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 1. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 2c23377419..4718b524fe 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -129,32 +129,40 @@ conversion utilities for the following models:
 22. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
     Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__> by Jingqing Zhang, Yao Zhao,
     Mohammad Saleh and Peter J. Liu.
-23. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+23. `ProphetNet <https://huggingface.co/transformers/master/model_doc/prophetnet.html>`__ (from Microsoft Research)
+    released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training
+    <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen,
+    Ruofei Zhang and Ming Zhou.
+24. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
     Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-24. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+25. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
     Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
     Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. ultilingual BERT into `DistilmBERT
     <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German version of
     DistilBERT.
-25. `SqueezeBert <https://huggingface.co/transformers/master/model_doc/squeezebert.html>`__ released with the paper
+26. `SqueezeBert <https://huggingface.co/transformers/master/model_doc/squeezebert.html>`__ released with the paper
     `SqueezeBERT: What can computer vision teach NLP about efficient neural networks?
     <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-26. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+27. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
     Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
     Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-27. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+28. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
     Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
     Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-28. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+29. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
     Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-29. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+30. `XLM-ProphetNet <https://huggingface.co/transformers/master/model_doc/xlmprophetnet.html>`__ (from Microsoft
+    Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training
+    <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen,
+    Ruofei Zhang and Ming Zhou.
+31. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
     Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
     Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
     Zettlemoyer and Veselin Stoyanov.
-30. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive
+32. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive
     Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
     Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-31. `Other community models <https://huggingface.co/models>`__, contributed by the `community
+33. `Other community models <https://huggingface.co/models>`__, contributed by the `community
     <https://huggingface.co/users>`__.
 
 .. toctree::
@@ -245,6 +253,7 @@ conversion utilities for the following models:
     model_doc/gpt
     model_doc/gpt2
     model_doc/pegasus
+    model_doc/prophetnet
     model_doc/rag
     model_doc/reformer
     model_doc/retribert
@@ -253,6 +262,7 @@ conversion utilities for the following models:
     model_doc/t5
     model_doc/transformerxl
     model_doc/xlm
+    model_doc/xlmprophetnet
     model_doc/xlmroberta
     model_doc/xlnet
 
diff --git a/docs/source/model_doc/encoderdecoder.rst b/docs/source/model_doc/encoderdecoder.rst
index a32b6b1350..441c09483e 100644
--- a/docs/source/model_doc/encoderdecoder.rst
+++ b/docs/source/model_doc/encoderdecoder.rst
@@ -27,4 +27,4 @@ EncoderDecoderModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.EncoderDecoderModel
-    :members: forward
+    :members: forward, from_encoder_decoder_pretrained
diff --git a/docs/source/model_doc/prophetnet.rst b/docs/source/model_doc/prophetnet.rst
new file mode 100644
index 0000000000..079dfb0c51
--- /dev/null
+++ b/docs/source/model_doc/prophetnet.rst
@@ -0,0 +1,83 @@
+ProphetNet
+-----------------------------------------------------------------------------------------------------------------------
+
+**DISCLAIMER:** If you see something strange, file a `Github Issue
+<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+@patrickvonplaten
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ProphetNet model was proposed in `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou on 13 Jan, 2020.
+
+ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of just the next token.
+
+The abstract from the paper is the following:
+
+*In this paper, we present a new sequence-to-sequence pre-training model called ProphetNet, which introduces a novel self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new state-of-the-art results on all these datasets compared to the models using the same scale pre-training corpus.*
+
+The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.
+
+
+ProphetNetConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetConfig
+    :members:
+
+
+ProphetNetTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetTokenizer
+    :members:
+
+
+ProphetNet specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_prophetnet.ProphetNetSeq2SeqLMOutput
+    :members:
+
+.. autoclass:: transformers.modeling_prophetnet.ProphetNetSeq2SeqModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_prophetnet.ProphetNetDecoderModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_prophetnet.ProphetNetDecoderLMOutput
+    :members:
+
+ProphetNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetModel
+    :members: forward
+
+
+ProphetNetEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetEncoder
+    :members: forward
+
+
+ProphetNetDecoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetDecoder
+    :members: forward
+
+
+ProphetNetForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetForConditionalGeneration
+    :members: forward
+
+
+ProphetNetForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetForCausalLM
+    :members: forward
diff --git a/docs/source/model_doc/xlmprophetnet.rst b/docs/source/model_doc/xlmprophetnet.rst
new file mode 100644
index 0000000000..f51b2f390d
--- /dev/null
+++ b/docs/source/model_doc/xlmprophetnet.rst
@@ -0,0 +1,63 @@
+XLM-ProphetNet
+-----------------------------------------------------------------------------------------------------------------------
+
+**DISCLAIMER:** If you see something strange, file a `Github Issue
+<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+@patrickvonplaten
+
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The XLM-ProphetNet model was proposed in `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou on 13 Jan, 2020.
+
+XLM-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of just the next token. Its architecture is identical to ProhpetNet, but the model was trained on the multi-lingual "wiki100" Wikipedia dump.
+
+The abstract from the paper is the following:
+
+*In this paper, we present a new sequence-to-sequence pre-training model called ProphetNet, which introduces a novel self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new state-of-the-art results on all these datasets compared to the models using the same scale pre-training corpus.*
+
+The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.
+
+XLMProphetNetConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetConfig
+    :members:
+
+
+XLMProphetNetTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetTokenizer
+    :members:
+
+
+XLMProphetNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetModel
+
+
+XLMProphetNetEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetEncoder
+
+
+XLMProphetNetDecoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetDecoder
+
+
+XLMProphetNetForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetForConditionalGeneration
+
+
+XLMProphetNetForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetForCausalLM
diff --git a/model_cards/microsoft/prophetnet-large-uncased-cnndm/README.md b/model_cards/microsoft/prophetnet-large-uncased-cnndm/README.md
new file mode 100644
index 0000000000..094dbf4020
--- /dev/null
+++ b/model_cards/microsoft/prophetnet-large-uncased-cnndm/README.md
@@ -0,0 +1,29 @@
+## prophetnet-large-uncased-cnndm
+Fine-tuned weights(converted from [original fairseq version repo](https://github.com/microsoft/ProphetNet)) for [ProphetNet](https://arxiv.org/abs/2001.04063) on summarization task CNN/DailyMail.  
+ProphetNet is a new pre-trained language model for sequence-to-sequence learning with a novel self-supervised objective called future n-gram prediction.  
+ProphetNet is able to predict more future tokens with a n-stream decoder. The original implementation is Fairseq version at [github repo](https://github.com/microsoft/ProphetNet).   
+
+### Usage
+```
+from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration, ProphetNetConfig
+
+model = ProphetNetForConditionalGeneration.from_pretrained('microsoft/prophetnet-large-uncased-cnndm')
+tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased-cnndm')
+
+ARTICLE_TO_SUMMARIZE = "USTC was founded in Beijing by the Chinese Academy of Sciences (CAS) in September 1958. The Director of CAS, Mr. Guo Moruo was appointed the first president of USTC. USTC's founding mission was to develop a high-level science and technology workforce, as deemed critical for development of China's economy, defense, and science and technology education. The establishment was hailed as \"A Major Event in the History of Chinese Education and Science.\" CAS has supported USTC by combining most of its institutes with the departments of the university. USTC is listed in the top 16 national key universities, becoming the youngest national key university.".lower()
+inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=100, return_tensors='pt')
+
+# Generate Summary
+summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=512, early_stopping=True)
+tokenizer.batch_decode(summary_ids.tolist())
+```
+Here, [X_SEP] is used as a special token to seperate sentences.
+### Citation
+```bibtex
+@article{yan2020prophetnet,
+  title={Prophetnet: Predicting future n-gram for sequence-to-sequence pre-training},
+  author={Yan, Yu and Qi, Weizhen and Gong, Yeyun and Liu, Dayiheng and Duan, Nan and Chen, Jiusheng and Zhang, Ruofei and Zhou, Ming},
+  journal={arXiv preprint arXiv:2001.04063},
+  year={2020}
+}
+```
diff --git a/model_cards/microsoft/prophetnet-large-uncased/README.md b/model_cards/microsoft/prophetnet-large-uncased/README.md
new file mode 100644
index 0000000000..1f75372a15
--- /dev/null
+++ b/model_cards/microsoft/prophetnet-large-uncased/README.md
@@ -0,0 +1,17 @@
+## prophetnet-large-uncased
+Pretrained weights for [ProphetNet](https://arxiv.org/abs/2001.04063).  
+ProphetNet is a new pre-trained language model for sequence-to-sequence learning with a novel self-supervised objective called future n-gram prediction.  
+ProphetNet is able to predict more future tokens with a n-stream decoder. The original implementation is Fairseq version at [github repo](https://github.com/microsoft/ProphetNet).   
+
+### Usage
+Please see [the official repository](https://github.com/microsoft/ProphetNet) for details.
+
+### Citation
+```bibtex
+@article{yan2020prophetnet,
+  title={Prophetnet: Predicting future n-gram for sequence-to-sequence pre-training},
+  author={Yan, Yu and Qi, Weizhen and Gong, Yeyun and Liu, Dayiheng and Duan, Nan and Chen, Jiusheng and Zhang, Ruofei and Zhou, Ming},
+  journal={arXiv preprint arXiv:2001.04063},
+  year={2020}
+}
+```
diff --git a/model_cards/microsoft/xprophetnet-large-wiki100-cased-xglue-ntg/README.md b/model_cards/microsoft/xprophetnet-large-wiki100-cased-xglue-ntg/README.md
new file mode 100644
index 0000000000..c7d30a6987
--- /dev/null
+++ b/model_cards/microsoft/xprophetnet-large-wiki100-cased-xglue-ntg/README.md
@@ -0,0 +1,32 @@
+## xprophetnet-large-wiki100-cased-xglue-ntg
+Cross-lingual version [ProphetNet](https://arxiv.org/abs/2001.04063), pretrained on [wiki100 xGLUE dataset](https://arxiv.org/abs/2004.01401) and finetuned on xGLUE cross-lingual News Titles Generation task.  
+ProphetNet is a new pre-trained language model for sequence-to-sequence learning with a novel self-supervised objective called future n-gram prediction.  
+ProphetNet is able to predict more future tokens with a n-stream decoder. The original implementation is Fairseq version at [github repo](https://github.com/microsoft/ProphetNet).   
+
+xProphetNet is also served as the baseline model for xGLUE cross-lingual natural language generation tasks.  
+For xGLUE corss-lingual NLG tasks, xProphetNet is finetuned with English data, but inference with both English and other zero-shot language data.  
+### Usage
+A quick usage is like: 
+```
+from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration, ProphetNetConfig
+
+model = ProphetNetForConditionalGeneration.from_pretrained('microsoft/xprophetnet-large-wiki100-cased-xglue-ntg')
+tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased-xglue-ntg')
+
+EN_SENTENCE = "Microsoft Corporation intends to officially end free support for the Windows 7 operating system after January 14, 2020, according to the official portal of the organization. From that day, users of this system will not be able to receive security updates, which could make their computers vulnerable to cyber attacks."
+RU_SENTENCE = "орпорация Microsoft намерена официально прекратить бесплатную поддержку операционной системы Windows 7 после 14 января 2020 года, сообщается на официальном портале организации . С указанного дня пользователи этой системы не смогут получать обновления безопасности, из-за чего их компьютеры могут стать уязвимыми к кибератакам."
+ZH_SENTENCE = "根据该组织的官方门户网站，微软公司打算在2020年1月14日之后正式终止对Windows 7操作系统的免费支持。从那时起，该系统的用户将无法接收安全更新，这可能会使他们的计算机容易受到网络攻击。"
+inputs = tokenizer([EN_SENTENCE, RU_SENTENCE, ZH_SENTENCE], padding=True, max_length=256, return_tensors='pt')
+
+summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True)
+print([tokenizer.decode(g) for g in summary_ids])  
+```
+### Citation
+```bibtex
+@article{yan2020prophetnet,
+  title={Prophetnet: Predicting future n-gram for sequence-to-sequence pre-training},
+  author={Yan, Yu and Qi, Weizhen and Gong, Yeyun and Liu, Dayiheng and Duan, Nan and Chen, Jiusheng and Zhang, Ruofei and Zhou, Ming},
+  journal={arXiv preprint arXiv:2001.04063},
+  year={2020}
+}
+```
diff --git a/model_cards/microsoft/xprophetnet-large-wiki100-cased-xglue-qg/README.md b/model_cards/microsoft/xprophetnet-large-wiki100-cased-xglue-qg/README.md
new file mode 100644
index 0000000000..42535ed859
--- /dev/null
+++ b/model_cards/microsoft/xprophetnet-large-wiki100-cased-xglue-qg/README.md
@@ -0,0 +1,31 @@
+## xprophetnet-large-wiki100-cased-xglue-ntg
+Cross-lingual version [ProphetNet](https://arxiv.org/abs/2001.04063), pretrained on [wiki100 xGLUE dataset](https://arxiv.org/abs/2004.01401) and finetuned on xGLUE cross-lingual Question Generation task.  
+ProphetNet is a new pre-trained language model for sequence-to-sequence learning with a novel self-supervised objective called future n-gram prediction.  
+ProphetNet is able to predict more future tokens with a n-stream decoder. The original implementation is Fairseq version at [github repo](https://github.com/microsoft/ProphetNet).   
+
+xProphetNet is also served as the baseline model for xGLUE cross-lingual natural language generation tasks.  
+For xGLUE corss-lingual NLG tasks, xProphetNet is finetuned with English data, but inference with both English and other zero-shot language data.  
+### Usage
+A quick usage is like: 
+```
+from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration, ProphetNetConfig
+
+model = ProphetNetForConditionalGeneration.from_pretrained('microsoft/xprophetnet-large-wiki100-cased-xglue-qg')
+tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased-xglue-qg')
+
+EN_SENTENCE = "Google left China in 2010"
+ZH_SENTENCE = "Google在2010年离开中国"
+inputs = tokenizer([EN_SENTENCE, ZH_SENTENCE], padding=True, max_length=256, return_tensors='pt')
+
+summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True)
+print([tokenizer.decode(g) for g in summary_ids])  
+```
+### Citation
+```bibtex
+@article{yan2020prophetnet,
+  title={Prophetnet: Predicting future n-gram for sequence-to-sequence pre-training},
+  author={Yan, Yu and Qi, Weizhen and Gong, Yeyun and Liu, Dayiheng and Duan, Nan and Chen, Jiusheng and Zhang, Ruofei and Zhou, Ming},
+  journal={arXiv preprint arXiv:2001.04063},
+  year={2020}
+}
+```
diff --git a/model_cards/microsoft/xprophetnet-large-wiki100-cased/README.md b/model_cards/microsoft/xprophetnet-large-wiki100-cased/README.md
new file mode 100644
index 0000000000..6e6aa29bf6
--- /dev/null
+++ b/model_cards/microsoft/xprophetnet-large-wiki100-cased/README.md
@@ -0,0 +1,19 @@
+## xprophetnet-large-wiki100-cased
+Cross-lingual version [ProphetNet](https://arxiv.org/abs/2001.04063), pretrained on [wiki100 xGLUE dataset](https://arxiv.org/abs/2004.01401).  
+ProphetNet is a new pre-trained language model for sequence-to-sequence learning with a novel self-supervised objective called future n-gram prediction.  
+ProphetNet is able to predict more future tokens with a n-stream decoder. The original implementation is Fairseq version at [github repo](https://github.com/microsoft/ProphetNet).   
+
+xProphetNet is also served as the baseline model for xGLUE cross-lingual natural language generation tasks.  
+For xGLUE corss-lingual NLG tasks, xProphetNet is finetuned with English data, but inference with both English and other zero-shot language data.  
+### Usage
+Please see [the official repository](https://github.com/microsoft/ProphetNet/tree/master/xProphetNet) for details.
+
+### Citation
+```bibtex
+@article{yan2020prophetnet,
+  title={Prophetnet: Predicting future n-gram for sequence-to-sequence pre-training},
+  author={Yan, Yu and Qi, Weizhen and Gong, Yeyun and Liu, Dayiheng and Duan, Nan and Chen, Jiusheng and Zhang, Ruofei and Zhou, Ming},
+  journal={arXiv preprint arXiv:2001.04063},
+  year={2020}
+}
+```
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 6f6c409efe..599ec72773 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -54,6 +54,7 @@ from .configuration_mmbt import MMBTConfig
 from .configuration_mobilebert import MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileBertConfig
 from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
 from .configuration_pegasus import PegasusConfig
+from .configuration_prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig
 from .configuration_rag import RagConfig
 from .configuration_reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig
 from .configuration_retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig
@@ -63,6 +64,7 @@ from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
 from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
 from .configuration_utils import PretrainedConfig
 from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig
+from .configuration_xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig
 from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
 from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig
 from .data import (
@@ -183,6 +185,7 @@ from .tokenization_lxmert import LxmertTokenizer
 from .tokenization_mobilebert import MobileBertTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_phobert import PhobertTokenizer
+from .tokenization_prophetnet import ProphetNetTokenizer
 from .tokenization_rag import RagTokenizer
 from .tokenization_retribert import RetriBertTokenizer
 from .tokenization_roberta import RobertaTokenizer
@@ -210,6 +213,7 @@ if is_sentencepiece_available():
     from .tokenization_pegasus import PegasusTokenizer
     from .tokenization_reformer import ReformerTokenizer
     from .tokenization_t5 import T5Tokenizer
+    from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer
     from .tokenization_xlm_roberta import XLMRobertaTokenizer
     from .tokenization_xlnet import XLNetTokenizer
 else:
@@ -251,7 +255,6 @@ if is_tokenizers_available():
 else:
     from .utils.dummy_tokenizers_objects import *
 
-
 # Trainer
 from .trainer_callback import (
     DefaultFlowCallback,
@@ -490,6 +493,15 @@ if is_torch_available():
         load_tf_weights_in_openai_gpt,
     )
     from .modeling_pegasus import PegasusForConditionalGeneration
+    from .modeling_prophetnet import (
+        PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST,
+        ProphetNetDecoder,
+        ProphetNetEncoder,
+        ProphetNetForCausalLM,
+        ProphetNetForConditionalGeneration,
+        ProphetNetModel,
+        ProphetNetPreTrainedModel,
+    )
     from .modeling_rag import RagModel, RagSequenceForGeneration, RagTokenForGeneration
     from .modeling_reformer import (
         REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -550,6 +562,14 @@ if is_torch_available():
         XLMPreTrainedModel,
         XLMWithLMHeadModel,
     )
+    from .modeling_xlm_prophetnet import (
+        XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST,
+        XLMProphetNetDecoder,
+        XLMProphetNetEncoder,
+        XLMProphetNetForCausalLM,
+        XLMProphetNetForConditionalGeneration,
+        XLMProphetNetModel,
+    )
     from .modeling_xlm_roberta import (
         XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
         XLMRobertaForCausalLM,
diff --git a/src/transformers/configuration_auto.py b/src/transformers/configuration_auto.py
index d0aecaf739..99371fca7b 100644
--- a/src/transformers/configuration_auto.py
+++ b/src/transformers/configuration_auto.py
@@ -41,6 +41,7 @@ from .configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfi
 from .configuration_mobilebert import MobileBertConfig
 from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
 from .configuration_pegasus import PegasusConfig
+from .configuration_prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig
 from .configuration_rag import RagConfig
 from .configuration_reformer import ReformerConfig
 from .configuration_retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig
@@ -50,6 +51,7 @@ from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
 from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
 from .configuration_utils import PretrainedConfig
 from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig
+from .configuration_xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig
 from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
 from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig
 
@@ -84,6 +86,8 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
         DPR_PRETRAINED_CONFIG_ARCHIVE_MAP,
         DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
         SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ]
     for key, value, in pretrained_map.items()
 )
@@ -116,6 +120,8 @@ CONFIG_MAPPING = OrderedDict(
         ("gpt2", GPT2Config),
         ("transfo-xl", TransfoXLConfig),
         ("xlnet", XLNetConfig),
+        ("xlm-prophetnet", XLMProphetNetConfig),
+        ("prophetnet", ProphetNetConfig),
         ("xlm", XLMConfig),
         ("ctrl", CTRLConfig),
         ("electra", ElectraConfig),
@@ -164,6 +170,8 @@ MODEL_NAMES_MAPPING = OrderedDict(
         ("layoutlm", "LayoutLM"),
         ("dpr", "DPR"),
         ("rag", "RAG"),
+        ("xlm-prophetnet", "XLMProphetNet"),
+        ("prophetnet", "ProphetNet"),
     ]
 )
 
diff --git a/src/transformers/configuration_lxmert.py b/src/transformers/configuration_lxmert.py
index 4e5bc8d5b2..ce60c1fe46 100644
--- a/src/transformers/configuration_lxmert.py
+++ b/src/transformers/configuration_lxmert.py
@@ -15,12 +15,11 @@
 """ LXMERT model configuration """
 
 
-import logging
-
 from .configuration_utils import PretrainedConfig
+from .utils import logging
 
 
-logger = logging.getLogger(__name__)
+logger = logging.get_logger(__name__)
 
 LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     "unc-nlp/lxmert-base-uncased": "",
diff --git a/src/transformers/configuration_prophetnet.py b/src/transformers/configuration_prophetnet.py
new file mode 100644
index 0000000000..85ebe3c5d5
--- /dev/null
+++ b/src/transformers/configuration_prophetnet.py
@@ -0,0 +1,167 @@
+# coding=utf-8
+# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" ProphetNet model configuration """
+
+
+from .configuration_utils import PretrainedConfig
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "microsoft/prophetnet-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/prophetnet-large-uncased/config.json",
+}
+
+
+class ProphetNetConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.ProphetNetModel`. It is used to
+    instantiate a ProphetNet model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
+
+    Args:
+        activation_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout ratio for activations inside the fully connected layer.
+        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+            Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling :class:`~transformers.ProphetNetModel`.
+        hidden_size (:obj:`int`, `optional`, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        num_encoder_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of encoder layers.
+        num_encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the ``intermediate`` (often named feed-forward) layer in decoder.
+        num_decoder_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of decoder layers.
+        num_decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        init_std (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether cross-attention layers should be added to the model.
+        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether this is an encoder/decoder model.
+        pad_token_id (:obj:`int`, `optional`, defaults to 1)
+            Padding token id.
+        bos_token_id (:obj:`int`, `optional`, defaults to 0)
+            Beginning of stream token id.
+        eos_token_id (:obj:`int`, `optional`, defaults to 2)
+            End of stream token id.
+        ngram (:obj:`int`, `optional`, defaults to 2)
+            Number of future tokens to predict.
+            Set to 1 to be same as traditional Language model to predict next first token.
+        num_buckets (:obj:`int`, `optional`, defaults to 32)
+            The number of buckets to use for each attention layer.
+            This is for relative position calculation. See the `T5 paper
+            <see https://arxiv.org/abs/1910.10683>`__ for more details.
+        relative_max_distance (:obj:`int`, `optional`, defaults to 128)
+            Relative distances greater than this number will be put into the last same bucket.
+            This is for relative position calculation. See the `T5 paper
+            <see https://arxiv.org/abs/1910.10683>`__ for more details.
+        disable_ngram_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether be trained predicting only the next first token.
+        eps (:obj:`float`, `optional`, defaults to 0.0):
+            Controls the ``epsilon`` parameter value for label
+            smoothing in the loss calculation. If set to 0, no label smoothing is performed.
+    """
+    model_type = "prophetnet"
+
+    def __init__(
+        self,
+        activation_dropout=0.1,
+        activation_function="gelu",
+        vocab_size=30522,
+        hidden_size=1024,
+        encoder_ffn_dim=4096,
+        num_encoder_layers=12,
+        num_encoder_attention_heads=16,
+        decoder_ffn_dim=4096,
+        num_decoder_layers=12,
+        num_decoder_attention_heads=16,
+        attention_dropout=0.1,
+        dropout=0.1,
+        max_position_embeddings=512,
+        init_std=0.02,
+        is_encoder_decoder=True,
+        add_cross_attention=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        decoder_start_token_id=0,
+        ngram=2,
+        num_buckets=32,
+        relative_max_distance=128,
+        disable_ngram_loss=False,
+        eps=0.0,
+        **kwargs
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            add_cross_attention=add_cross_attention,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.num_encoder_layers = num_encoder_layers
+        self.num_encoder_attention_heads = num_encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.num_decoder_layers = num_decoder_layers
+        self.num_decoder_attention_heads = num_decoder_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.init_std = init_std  # Normal(0, this parameter)
+        self.activation_function = activation_function
+
+        # parameters for prophetnet
+        self.ngram = ngram
+        self.num_buckets = num_buckets
+        self.relative_max_distance = relative_max_distance
+        self.disable_ngram_loss = disable_ngram_loss
+        self.eps = eps
+
+        # 3 Types of Dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.dropout = dropout
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self.num_encoder_attention_heads
+
+    @property
+    def num_hidden_layers(self) -> int:
+        return self.num_encoder_layers + self.num_decoder_layers
diff --git a/src/transformers/configuration_xlm_prophetnet.py b/src/transformers/configuration_xlm_prophetnet.py
new file mode 100644
index 0000000000..1641b9ceed
--- /dev/null
+++ b/src/transformers/configuration_xlm_prophetnet.py
@@ -0,0 +1,35 @@
+# coding=utf-8
+# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" XLM-ProphetNet model configuration """
+
+
+from .configuration_prophetnet import ProphetNetConfig
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "microsoft/xprophetnet-large-wiki100-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/xprophetnet-large-wiki100-cased/config.json",
+}
+
+
+class XLMProphetNetConfig(ProphetNetConfig):
+    """
+    This class overrides :class:`~transformers.ProphetNetConfig`. Please check the
+    superclass for the appropriate documentation alongside usage examples.
+    """
+
+    model_type = "xlm-prophetnet"
diff --git a/src/transformers/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
new file mode 100644
index 0000000000..e6f9e78c7a
--- /dev/null
+++ b/src/transformers/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
@@ -0,0 +1,162 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert ProphetNet checkpoint."""
+
+
+import argparse
+
+import torch
+
+from transformers import logging
+from transformers.modeling_prophetnet import ProphetNetForConditionalGeneration
+from transformers.modeling_xlm_prophetnet import XLMProphetNetForConditionalGeneration
+
+# transformers_old should correspond to branch `save_old_prophetnet_model_structure` here
+# original prophetnet_checkpoints are saved under `patrickvonplaten/..._old` respectively
+from transformers_old.modeling_prophetnet import (
+    ProphetNetForConditionalGeneration as ProphetNetForConditionalGenerationOld,
+)
+from transformers_old.modeling_xlm_prophetnet import (
+    XLMProphetNetForConditionalGeneration as XLMProphetNetForConditionalGenerationOld,
+)
+
+
+logger = logging.get_logger(__name__)
+logging.set_verbosity_info()
+
+
+def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, pytorch_dump_folder_path: str):
+    """
+    Copy/paste/tweak prohpetnet's weights to our prophetnet structure.
+    """
+    if "xprophetnet" in prophetnet_checkpoint_path:
+        prophet_old = XLMProphetNetForConditionalGenerationOld.from_pretrained(prophetnet_checkpoint_path)
+        prophet, loading_info = XLMProphetNetForConditionalGeneration.from_pretrained(
+            prophetnet_checkpoint_path, output_loading_info=True
+        )
+    else:
+        prophet_old = ProphetNetForConditionalGenerationOld.from_pretrained(prophetnet_checkpoint_path)
+        prophet, loading_info = ProphetNetForConditionalGeneration.from_pretrained(
+            prophetnet_checkpoint_path, output_loading_info=True
+        )
+
+    special_keys = ["key_proj", "value_proj", "query_proj"]
+
+    mapping = {
+        "self_attn": "ngram_self_attn",
+        "cross_attn": "encoder_attn",
+        "cross_attn_layer_norm": "encoder_attn_layer_norm",
+        "feed_forward_layer_norm": "final_layer_norm",
+        "feed_forward": "",
+        "intermediate": "fc1",
+        "output": "fc2",
+        "key_proj": "k_proj",
+        "query_proj": "q_proj",
+        "value_proj": "v_proj",
+        "word_embeddings": "embed_tokens",
+        "embeddings_layer_norm": "emb_layer_norm",
+        "relative_pos_embeddings": "relative_linear",
+        "ngram_embeddings": "ngram_input_embed",
+        "position_embeddings": "embed_positions",
+    }
+
+    for key in loading_info["missing_keys"]:
+        attributes = key.split(".")
+
+        if attributes[0] == "lm_head":
+            model = prophet
+            old_model = prophet_old
+        else:
+            model = prophet.prophetnet
+            old_model = prophet_old.model
+
+        is_key_init = False
+        for attribute in attributes:
+            if attribute in mapping:
+                old_attribute = mapping[attribute]
+                if not hasattr(old_model, old_attribute) and len(old_attribute) > 0:
+                    old_attribute = attribute
+            elif hasattr(old_model, attribute):
+                old_attribute = attribute
+
+            if attribute == "weight":
+                assert old_model.weight.shape == model.weight.shape, "Shapes have to match!"
+                model.weight = old_model.weight
+                logger.info(f"{attribute} is initialized.")
+                is_key_init = True
+                break
+            elif attribute == "bias":
+                assert old_model.bias.shape == model.bias.shape, "Shapes have to match!"
+                model.bias = old_model.bias
+                logger.info(f"{attribute} is initialized")
+                is_key_init = True
+                break
+            elif attribute in special_keys and hasattr(old_model, "in_proj_weight"):
+                embed_dim = old_model.in_proj_weight.shape[0] // 3
+                param = getattr(model, attribute)
+                param.weight.shape == old_model.in_proj_weight[:embed_dim, :].shape, "Shapes have to match"
+                param.bias.shape == old_model.in_proj_bias[:embed_dim].shape, "Shapes have to match"
+                if attribute == "query_proj":
+                    model.query_proj.weight = torch.nn.Parameter(old_model.in_proj_weight[:embed_dim, :])
+                    model.query_proj.bias = torch.nn.Parameter(old_model.in_proj_bias[:embed_dim])
+
+                elif attribute == "key_proj":
+                    model.key_proj.weight = torch.nn.Parameter(old_model.in_proj_weight[embed_dim : 2 * embed_dim, :])
+                    model.key_proj.bias = torch.nn.Parameter(old_model.in_proj_bias[embed_dim : 2 * embed_dim])
+                elif attribute == "value_proj":
+                    model.value_proj.weight = torch.nn.Parameter(old_model.in_proj_weight[2 * embed_dim :, :])
+                    model.value_proj.bias = torch.nn.Parameter(old_model.in_proj_bias[2 * embed_dim :])
+                is_key_init = True
+                break
+            elif attribute == "position_embeddings":
+                assert (
+                    model.position_embeddings.weight.shape[-1] == old_model.embed_positions.weight.shape[-1]
+                ), "Hidden size has to match"
+                assert model.position_embeddings.weight.shape[0] == 512, "We want 512 position_embeddings."
+                model.position_embeddings.weight = torch.nn.Parameter(old_model.embed_positions.weight[:512, :])
+                is_key_init = True
+                break
+
+            if attribute.isdigit():
+                model = model[int(attribute)]
+                old_model = old_model[int(old_attribute)]
+            else:
+                model = getattr(model, attribute)
+
+                if old_attribute == "":
+                    old_model = old_model
+                else:
+                    if not hasattr(old_model, old_attribute):
+                        raise ValueError(f"{old_model} does not have {old_attribute}")
+                    old_model = getattr(old_model, old_attribute)
+
+        if not is_key_init:
+            raise ValueError(f"{key} was not correctly initialized!")
+
+    print(f"Saving model to {pytorch_dump_folder_path}")
+    prophet.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--prophetnet_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
+    args = parser.parse_args()
+    convert_prophetnet_checkpoint_to_pytorch(args.prophetnet_checkpoint_path, args.pytorch_dump_folder_path)
diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py
index 9a2a4584a6..7369e62363 100644
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -43,6 +43,7 @@ from .configuration_auto import (
     MobileBertConfig,
     OpenAIGPTConfig,
     PegasusConfig,
+    ProphetNetConfig,
     ReformerConfig,
     RetriBertConfig,
     RobertaConfig,
@@ -50,6 +51,7 @@ from .configuration_auto import (
     T5Config,
     TransfoXLConfig,
     XLMConfig,
+    XLMProphetNetConfig,
     XLMRobertaConfig,
     XLNetConfig,
     replace_list_option_in_docstrings,
@@ -155,6 +157,7 @@ from .modeling_mobilebert import (
 )
 from .modeling_openai import OpenAIGPTForSequenceClassification, OpenAIGPTLMHeadModel, OpenAIGPTModel
 from .modeling_pegasus import PegasusForConditionalGeneration
+from .modeling_prophetnet import ProphetNetForCausalLM, ProphetNetForConditionalGeneration, ProphetNetModel
 from .modeling_rag import (  # noqa: F401 - need to import all RagModels to be in globals() function
     RagModel,
     RagSequenceForGeneration,
@@ -194,6 +197,11 @@ from .modeling_xlm import (
     XLMModel,
     XLMWithLMHeadModel,
 )
+from .modeling_xlm_prophetnet import (
+    XLMProphetNetForCausalLM,
+    XLMProphetNetForConditionalGeneration,
+    XLMProphetNetModel,
+)
 from .modeling_xlm_roberta import (
     XLMRobertaForCausalLM,
     XLMRobertaForMaskedLM,
@@ -247,6 +255,8 @@ MODEL_MAPPING = OrderedDict(
         (BertGenerationConfig, BertGenerationEncoder),
         (DebertaConfig, DebertaModel),
         (DPRConfig, DPRQuestionEncoder),
+        (XLMProphetNetConfig, XLMProphetNetModel),
+        (ProphetNetConfig, ProphetNetModel),
     ]
 )
 
@@ -325,6 +335,8 @@ MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict(
         (CTRLConfig, CTRLLMHeadModel),
         (ReformerConfig, ReformerModelWithLMHead),
         (BertGenerationConfig, BertGenerationDecoder),
+        (ProphetNetConfig, XLMProphetNetForCausalLM),
+        (ProphetNetConfig, ProphetNetForCausalLM),
     ]
 )
 
@@ -359,6 +371,8 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
         (BartConfig, BartForConditionalGeneration),
         (FSMTConfig, FSMTForConditionalGeneration),
         (EncoderDecoderConfig, EncoderDecoderModel),
+        (XLMProphetNetConfig, XLMProphetNetForConditionalGeneration),
+        (ProphetNetConfig, ProphetNetForConditionalGeneration),
     ]
 )
 
diff --git a/src/transformers/modeling_lxmert.py b/src/transformers/modeling_lxmert.py
index e5d0c271e2..98ab0b9d16 100644
--- a/src/transformers/modeling_lxmert.py
+++ b/src/transformers/modeling_lxmert.py
@@ -15,7 +15,6 @@
 """ PyTorch LXMERT model. """
 
 
-import logging
 import math
 import os
 from dataclasses import dataclass
@@ -35,9 +34,10 @@ from .file_utils import (
     replace_return_docstrings,
 )
 from .modeling_utils import PreTrainedModel
+from .utils import logging
 
 
-logger = logging.getLogger(__name__)
+logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "LxmertConfig"
 _TOKENIZER_FOR_DOC = "LxmertTokenizer"
diff --git a/src/transformers/modeling_prophetnet.py b/src/transformers/modeling_prophetnet.py
new file mode 100644
index 0000000000..47ea468cb6
--- /dev/null
+++ b/src/transformers/modeling_prophetnet.py
@@ -0,0 +1,2052 @@
+# coding=utf-8
+# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version). """
+
+import copy
+import math
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from .activations import ACT2FN
+from .configuration_prophetnet import ProphetNetConfig
+from .file_utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_callable, replace_return_docstrings
+from .modeling_outputs import BaseModelOutput
+from .modeling_utils import PreTrainedModel
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "ProphenetConfig"
+_TOKENIZER_FOR_DOC = "ProphetNetTokenizer"
+
+PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "microsoft/prophetnet-large-uncased",
+    # See all ProphetNet models at https://huggingface.co/models?filter=prophetnet
+]
+
+
+PROPHETNET_START_DOCSTRING = r"""
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    Original ProphetNet code can be found at <https://github.com/microsoft/ProphetNet> .
+    Checkpoints were converted from original Fairseq checkpoints. For more information on
+    the checkpoint conversion, please take a look at the file ``convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py``.
+
+    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matters related to general usage and behavior.
+
+    Parameters:
+        config (:class:`~transformers.ProphetNetConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+PROPHETNET_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using :class:`~transformers.ProphetNetTokenizer`.
+            See :meth:`transformers.PreTrainedTokenizer.encode` and
+            :meth:`transformers.PreTrainedTokenizer.__call__` for details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+        attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            `What are attention masks? <../glossary.html#attention-mask>`__
+        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+            Provide for translation and summarization training. By default, the model will create this tensor by
+            shifting the :obj:`input_ids` to the right, following the paper.
+        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
+            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
+            also be used by default.
+
+            If you want to change padding behavior, you should read :func:`modeling_bart._prepare_decoder_inputs` and
+            modify to your needs. See diagram 1 in `the paper <https://arxiv.org/abs/1910.13461>`__ for more
+            information on the default strategy.
+        encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
+            Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: :obj:`attentions`)
+            :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`) is a
+            sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
+            the decoder.
+        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
+
+            If :obj:`past_key_values` are used, the user can optionally input only the last
+            ``decoder_input_ids`` (those that don't have their past key value states given to this model) of shape
+            :obj:`(batch_size, 1)` instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
+        use_cache (:obj:`bool`, `optional`):
+            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+            decoding (see :obj:`past_key_values`).
+        output_attentions (:obj:`bool`, `optional`):
+            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+            tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`):
+            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+            more detail.
+        return_dict (:obj:`bool`, `optional`):
+            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+"""
+
+PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using :class:`~transformers.ProphetNetTokenizer`.
+            See :meth:`transformers.PreTrainedTokenizer.encode` and
+            :meth:`transformers.PreTrainedTokenizer.__call__` for details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+        attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            `What are attention masks? <../glossary.html#attention-mask>`__
+        output_attentions (:obj:`bool`, `optional`):
+            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+            tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`):
+            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+            more detail.
+        return_dict (:obj:`bool`, `optional`):
+            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+"""
+
+
+def softmax(hidden_state, dim, onnx_trace=False):
+    if onnx_trace:
+        return F.softmax(hidden_state.float(), dim=dim)
+    else:
+        return F.softmax(hidden_state, dim=dim, dtype=torch.float32)
+
+
+def ngram_attention_bias(sequence_length, ngram, device, dtype):
+    """
+    This function computes the bias for the predict stream
+    """
+    bias = torch.ones((ngram, sequence_length, 2 * sequence_length), device=device, dtype=dtype) * float("-inf")
+    # create bias
+    for stream_idx in range(ngram):
+        for i in range(sequence_length):
+            bias[stream_idx, i, sequence_length + i] = 0
+            bias[stream_idx, i, : max(i - stream_idx, 0) + 1] = 0
+    return bias
+
+
+def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_bidirectional=False):
+    """
+    This function computes individual parts of the relative position buckets. For more detail, see paper.
+    """
+    inv_relative_positions = -relative_positions
+    rel_positions_bucket = 0
+
+    if is_bidirectional:
+        num_buckets = num_buckets // 2
+        rel_positions_bucket = (
+            rel_positions_bucket
+            + torch.lt(inv_relative_positions, torch.zeros_like(inv_relative_positions)).int() * num_buckets
+        )
+        inv_relative_positions = torch.abs(inv_relative_positions)
+    else:
+        inv_relative_positions = torch.max(inv_relative_positions, torch.zeros_like(inv_relative_positions))
+
+    max_exact = num_buckets // 2
+    is_small = torch.lt(inv_relative_positions, max_exact)
+    val_if_large = max_exact + torch.log(inv_relative_positions.float() / max_exact) / math.log(
+        max_distance / max_exact
+    ) * (num_buckets - max_exact)
+    val_if_large = torch.min(val_if_large, torch.ones_like(val_if_large) * (num_buckets - 1)).int()
+    rel_positions_bucket = rel_positions_bucket + torch.where(is_small, inv_relative_positions.int(), val_if_large)
+    return rel_positions_bucket
+
+
+def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids):
+    """
+    This function computes both main and predict relative position buckets. For more detail, see paper.
+    """
+    # main stream
+    main_stream_relative_positions = position_ids.unsqueeze(1).repeat(1, position_ids.size(-1), 1)
+    main_stream_relative_positions = main_stream_relative_positions - position_ids.unsqueeze(-1)
+
+    # predicting stream
+    predicting_stream_relative_positions = torch.cat((position_ids - 1, position_ids), dim=-1).unsqueeze(1)
+    predicting_stream_relative_positions = predicting_stream_relative_positions.repeat(1, position_ids.size(-1), 1)
+    predicting_stream_relative_positions = predicting_stream_relative_positions - position_ids.unsqueeze(-1)
+
+    # get both position buckets
+    main_relative_position_buckets = compute_relative_buckets(
+        num_buckets, max_distance, main_stream_relative_positions, is_bidirectional=False
+    )
+    predict_relative_position_buckets = compute_relative_buckets(
+        num_buckets, max_distance, predicting_stream_relative_positions, is_bidirectional=False
+    )
+    return main_relative_position_buckets, predict_relative_position_buckets
+
+
+@dataclass
+class ProphetNetSeq2SeqLMOutput(ModelOutput):
+    """
+    Base class for sequence-to-sequence language models outputs.
+
+    Args:
+        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+            Languaged modeling loss.
+        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, decoder_sequence_length, config.vocab_size)`):
+            Prediction scores of the main stream language modeling head
+            (scores for each vocabulary token before SoftMax).
+        logits_ngram (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
+            Prediction scores of the predict stream language modeling head (scores for
+            each vocabulary token before SoftMax).
+        past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
+            List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`,  with each tensor of shape
+            :obj:`(2, batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`).
+
+            Contains pre-computed hidden-states (key and values in the attention blocks)
+            of the decoder that can be
+            used (see :obj:`past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings
+            + one for the output of each layer)
+            of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
+        decoder_ngram_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of the predict stream of the decoder at the
+            output of each layer plus the initial embedding outputs.
+        decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        decoder_ngram_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the predict stream of the decoder, after the attention softmax,
+            used to compute the weighted average in the self-attention heads.
+        decoder_cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, encoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the cross-attention layer of the decoder,
+            after the attention softmax, used to compute the weighted average in the
+        encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the
+            encoder of the model.
+        encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+        encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, encoder_sequence_length, encoder_sequence_length)`.
+            Attentions weights of the encoder, after the attention softmax, used
+            to compute the weighted average in the
+            self-attention heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    logits_ngram: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_ngram_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class ProphetNetSeq2SeqModelOutput(ModelOutput):
+    """
+    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
+    decoding.
+
+    Args:
+        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`):
+            Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.
+
+            If :obj:`past_key_values` is used only the last hidden-state of
+            the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
+        last_hidden_state_ngram (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,ngram * decoder_sequence_length, config.vocab_size)`):
+            Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
+        past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
+            List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`,  with each tensor of shape
+            :obj:`(2, batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`).
+
+            Contains pre-computed hidden-states (key and values in the attention blocks)
+            of the decoder that can be
+            used (see :obj:`past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
+        decoder_ngram_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of the predict stream of the decoder at the
+            output of each layer plus the initial embedding outputs.
+        decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        decoder_ngram_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the predict stream of the decoder,
+            after the attention softmax, used to compute the weighted average in the
+        decoder_cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, encoder_sequence_length,
+            decoder_sequence_length)`.
+
+            Attentions weights of the cross-attention layer of the decoder,
+            after the attention softmax, used to compute the weighted average in the
+        encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+        encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, encoder_sequence_length, encoder_sequence_length)`.
+
+            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+    """
+
+    last_hidden_state: torch.FloatTensor
+    last_hidden_state_ngram: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_ngram_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class ProphetNetDecoderModelOutput(ModelOutput):
+    """
+    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+    Args:
+        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`):
+            Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.
+
+            If :obj:`past_key_values` is used only the last hidden-state
+            of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
+        last_hidden_state_ngram (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
+            Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
+        past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
+            List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`,  with each tensor of shape
+            :obj:`(2, batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`).
+
+            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+            used (see :obj:`past_key_values` input) to speed up sequential decoding.
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of main stream of the decoder at the output
+            of each layer plus the initial embedding outputs.
+        ngram_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of the predict stream of the decoder at the
+            output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax,
+            used to compute the weighted average in the
+            self-attention heads.
+        ngram_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the predict stream of the decoder,
+            after the attention softmax, used to compute the weighted average in the
+        cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, encoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to compute the weighted average in the
+    """
+
+    last_hidden_state: torch.FloatTensor
+    last_hidden_state_ngram: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Tuple[torch.FloatTensor]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    hidden_states_ngram: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class ProphetNetDecoderLMOutput(ModelOutput):
+    """
+    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+    Args:
+        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+            Languaged modeling loss.
+        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, decoder_sequence_length, config.vocab_size)`):
+            Prediction scores of the main stream language modeling head (scores for each vocabulary token before SoftMax).
+        logits_ngram (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
+            Prediction scores of the predict stream language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
+            List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`,  with each tensor of shape
+            :obj:`(2, batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`).
+
+            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+            used (see :obj:`past_key_values` input) to speed up sequential decoding.
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
+        ngram_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        ngram_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the weighted average in the
+        cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_attn_heads, encoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to compute the weighted average in the
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    logits_ngram: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Tuple[torch.FloatTensor]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    hidden_states_ngram: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+def ProphetNetLayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
+    if torch.cuda.is_available():
+        try:
+            from apex.normalization import FusedProphetNetLayerNorm
+
+            return FusedProphetNetLayerNorm(normalized_shape, eps, elementwise_affine)
+        except ImportError:
+            pass
+    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
+
+
+class ProphetNetPreTrainedModel(PreTrainedModel):
+    config_class = ProphetNetConfig
+    base_model_prefix = "prophetnet"
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def _shift_right(self, input_ids):
+        decoder_start_token_id = self.config.decoder_start_token_id
+        pad_token_id = self.config.pad_token_id
+
+        assert (
+            decoder_start_token_id is not None
+        ), "self.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information"
+
+        # shift inputs to the right
+        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+        shifted_input_ids[..., 0] = decoder_start_token_id
+
+        assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+        assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values"
+
+        return shifted_input_ids
+
+
+class ProhpetNetPositionalEmbeddings(nn.Embedding):
+    """
+    This module learns positional embeddings up to a fixed maximum size.
+    Padding ids are ignored by either offsetting based on padding_idx
+    or by setting padding_idx to None and ensuring that the appropriate
+    position ids are passed to the forward function.
+    """
+
+    def __init__(self, config: ProphetNetConfig):
+        super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)
+
+    def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
+        assert (position_ids is None) or (
+            self.padding_idx is None
+        ), "If position_ids is pre-computed then padding_idx should not be set."
+
+        if position_ids is None:
+            if past_key_values is not None:
+                # position_ids is the same for every token when decoding a single step
+                # Without the int() cast, it doesn't work in some cases when exporting to ONNX
+                prev_num_input_ids = past_key_values[0]["self"]["prev_key_states"].shape[2]
+                num_input_ids = inputs_shape[1] + prev_num_input_ids
+                position_ids = torch.ones((1, 1), dtype=torch.long, device=device) * (
+                    int(self.padding_idx + num_input_ids)
+                )
+            else:
+                if attention_mask is None:
+                    attention_mask = torch.ones(inputs_shape, dtype=torch.long, device=device)
+
+                # retrieve position_ids from input_ids / attention_mask
+                position_ids = (
+                    torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask
+                ).long() + self.padding_idx
+
+        return super().forward(position_ids), position_ids
+
+    def _forward(self, position_ids):
+        return super().forward(position_ids)
+
+
+class ProphetNetSelfAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        config: ProphetNetConfig,
+        num_attn_heads: int,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+
+        self.attention_dropout = config.attention_dropout
+        self.dropout = config.dropout
+        self.num_attn_heads = num_attn_heads
+        self.head_dim = hidden_size // num_attn_heads
+
+        assert (
+            self.head_dim * num_attn_heads == hidden_size
+        ), "`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`"
+
+        self.key_proj = nn.Linear(hidden_size, hidden_size)
+        self.value_proj = nn.Linear(hidden_size, hidden_size)
+        self.query_proj = nn.Linear(hidden_size, hidden_size)
+
+        self.out_proj = nn.Linear(hidden_size, hidden_size)
+
+    def _reshape(self, tensor, first_dim, batch_size):
+        return tensor.reshape(first_dim, batch_size * self.num_attn_heads, self.head_dim).transpose(0, 1)
+
+    def forward(
+        self,
+        hidden_states,
+        key_value_states: Optional[Tensor] = None,
+        attention_mask: Optional[Tensor] = None,
+        layer_state: Optional[Dict[str, Optional[Tensor]]] = None,
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+
+        sequence_length, batch_size, hidden_size = hidden_states.size()
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        cache_key = "cross_attention" if is_cross_attention else "self"
+        assert list(hidden_states.size()) == [
+            sequence_length,
+            batch_size,
+            hidden_size,
+        ], f"Size of hidden states should be {sequence_length, batch_size, hidden_size}, but is {hidden_states.size()}"
+
+        # previous time steps are cached - no need to recompute key and value if they are static
+        if layer_state is not None:
+            saved_state = layer_state.get(cache_key, None)
+
+        query_states = self.query_proj(hidden_states) / (self.head_dim ** 0.5)
+        query_states = self._reshape(query_states, sequence_length, batch_size)
+
+        if not is_cross_attention:
+            # self-attention
+            key_states = self.key_proj(hidden_states)
+            key_states = self._reshape(key_states, -1, batch_size)
+            value_states = self.value_proj(hidden_states)
+            value_states = self._reshape(value_states, -1, batch_size)
+        elif saved_state is None:
+            # cross-attention without layer state
+            key_states = self.key_proj(key_value_states)
+            key_states = self._reshape(key_states, -1, batch_size)
+            value_states = self.value_proj(key_value_states)
+            value_states = self._reshape(value_states, -1, batch_size)
+        else:
+            key_states = saved_state["prev_key_states"].view(batch_size * self.num_attn_heads, -1, self.head_dim)
+            value_states = saved_state["prev_value_states"].view(batch_size * self.num_attn_heads, -1, self.head_dim)
+
+        # Update cache
+        if is_cross_attention:
+            layer_state[cache_key] = {
+                "prev_key_states": key_states.view(batch_size, self.num_attn_heads, -1, self.head_dim),
+                "prev_value_states": value_states.view(batch_size, self.num_attn_heads, -1, self.head_dim),
+            }
+
+        key_sequence_length = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+        assert attn_weights.size() == (
+            batch_size * self.num_attn_heads,
+            sequence_length,
+            key_sequence_length,
+        ), f"`attn_weights` should be of size {batch_size * self.num_attn_heads, sequence_length, key_sequence_length}, but is of size {attn_weights.shape}"
+
+        # This is part of a workaround to get around fork/join parallelism not supporting Optional types.
+        if attention_mask is not None and attention_mask.dim() == 0:
+            attention_mask = None
+        assert attention_mask is None or attention_mask.size() == (
+            self.num_attn_heads * batch_size,
+            1,
+            key_sequence_length,
+        ), f"`attention_mask` should be `None` or of shape attention_mask.size() == {batch_size * self.num_attn_heads, 1, key_sequence_length}, but is {attention_mask.shape}"
+
+        if attention_mask is not None:  # don't attend to padding symbols
+            attn_weights = attn_weights + attention_mask
+
+        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_probs = F.dropout(
+            attn_weights,
+            p=self.attention_dropout,
+            training=self.training,
+        )
+
+        attn_output = torch.bmm(attn_probs, value_states)
+        assert attn_output.size() == (
+            batch_size * self.num_attn_heads,
+            sequence_length,
+            self.head_dim,
+        ), "`attn_output` should be of shape {batch_size * self.num_attn_heads, sequence_length, self.head_dim}, but is of shape {attn_output.size()}"
+        attn_output = attn_output.transpose(0, 1).contiguous().view(sequence_length, batch_size, hidden_size)
+
+        attn_output = self.out_proj(attn_output)
+
+        attn_weights = attn_weights.view(batch_size, self.num_attn_heads, sequence_length, key_sequence_length)
+        attn_output = F.dropout(attn_output, p=self.dropout, training=self.training)
+        return attn_output, attn_weights
+
+
+class ProhpetNetFeedForward(nn.Module):
+    """
+    This is the residual two feed-forward layer block based on the original
+    Transformer implementation.
+    """
+
+    def __init__(self, config: ProphetNetConfig, ffn_dim: int):
+        super().__init__()
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.intermediate = nn.Linear(config.hidden_size, ffn_dim)
+        self.output = nn.Linear(ffn_dim, config.hidden_size)
+        self.activation_dropout = config.activation_dropout
+        self.dropout = config.dropout
+
+    def forward(self, hidden_states):
+        hidden_states = self.intermediate(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+
+        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.output(hidden_states)
+        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        return hidden_states
+
+
+class ProphetNetNgramProphetNetSelfAttention(nn.Module):
+    def __init__(self, config: ProphetNetConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.num_buckets = config.num_buckets
+        self.relative_max_distance = config.relative_max_distance
+        self.num_attn_heads = config.num_attention_heads
+        self.dropout = config.dropout
+        self.attention_dropout = config.attention_dropout
+        self.head_dim = config.hidden_size // self.num_attn_heads
+        self.ngram = config.ngram
+
+        assert (
+            self.head_dim * self.num_attn_heads == config.hidden_size
+        ), "config.hidden_size must be divisible by num_attn_heads"
+        # key, value, query projection
+        self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
+        self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)
+        self.query_proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+        # out projection
+        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+        # rel position embeddings
+        self.relative_pos_embeddings = nn.Linear(config.hidden_size, self.num_buckets * self.num_attn_heads)
+
+        # for onnx runtime
+        self.onnx_trace = False
+
+    def _reshape(self, tensor, first_dim, batch_size):
+        return tensor.reshape(first_dim, batch_size * self.num_attn_heads, self.head_dim).transpose(0, 1)
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True
+
+    def forward(
+        self,
+        hidden_states,
+        layer_state=None,
+        attention_mask=None,
+        extended_predict_attention_mask=None,
+        main_relative_position_buckets=None,
+        predict_relative_position_buckets=None,
+        position_ids=None,
+    ):
+        sequence_length, batch_size, hidden_size = hidden_states.size()
+
+        assert list(hidden_states.size()) == [
+            sequence_length,
+            batch_size,
+            hidden_size,
+        ], f"`hidden_states` should be of shape {sequence_length, batch_size, hidden_size}, but is of shape {hidden_states.shape}"
+
+        # key and value of previous time steps are cached
+        saved_state = layer_state.get("self", None)
+
+        # project
+        query_states = self.query_proj(hidden_states)
+        key_states = self.key_proj(hidden_states)
+        value_states = self.value_proj(hidden_states)
+
+        # normalize
+        query_states = query_states / (self.head_dim ** 0.5)
+
+        # reshape
+        query_states = self._reshape(query_states, sequence_length, batch_size)
+        key_states = self._reshape(key_states, -1, batch_size)
+        value_states = self._reshape(value_states, -1, batch_size)
+
+        # chunk into main stream and predict stream
+        hidden_states_list = hidden_states.chunk(1 + self.ngram, dim=0)
+
+        query_states_list = query_states.chunk(1 + self.ngram, dim=1)
+        key_states_list = key_states.chunk(1 + self.ngram, dim=1)
+        value_states_list = value_states.chunk(1 + self.ngram, dim=1)
+
+        main_hidden_states, hidden_states_predict_list = hidden_states_list[0], hidden_states_list[1:]
+        main_query_states, predict_query_states_list = query_states_list[0], query_states_list[1:]
+        main_key_states, predict_key_states_list = key_states_list[0], key_states_list[1:]
+        main_value_states, predict_value_states_list = value_states_list[0], value_states_list[1:]
+
+        # saved states are stored with shape (batch_size, num_attn_heads, seq_len, head_dim)
+        if saved_state is not None:
+            prev_main_key_states = saved_state["prev_key_states"].view(
+                batch_size * self.num_attn_heads, -1, self.head_dim
+            )
+            main_key_states = torch.cat((prev_main_key_states, main_key_states), dim=1)
+            prev_main_value_states = saved_state["prev_value_states"].view(
+                batch_size * self.num_attn_heads, -1, self.head_dim
+            )
+            main_value_states = torch.cat((prev_main_value_states, main_value_states), dim=1)
+
+        # Update cache
+        layer_state["self"] = {
+            "prev_key_states": main_key_states.view(batch_size, self.num_attn_heads, -1, self.head_dim),
+            "prev_value_states": main_value_states.view(batch_size, self.num_attn_heads, -1, self.head_dim),
+        }
+
+        # get seq_length of main stream only
+        main_sequence_length = sequence_length // (1 + self.ngram)
+
+        # MAIN-STREAM
+        # main attn weights
+        main_attn_weights = torch.bmm(main_query_states, main_key_states.transpose(1, 2))
+
+        # retrieve relative position embeddings for each layer -> see paper for more details
+        main_relative_pos_embeddings = self.get_main_relative_pos_embeddings(
+            main_hidden_states, main_attn_weights, position_ids, main_relative_position_buckets
+        )
+        main_attn_weights = main_attn_weights + main_relative_pos_embeddings
+
+        if attention_mask is not None:
+            main_attn_weights = main_attn_weights + attention_mask
+
+        main_attn_probs = softmax(
+            main_attn_weights,
+            dim=-1,
+            onnx_trace=self.onnx_trace,
+        ).type_as(main_attn_weights)
+
+        main_attn_probs = F.dropout(main_attn_probs, p=self.attention_dropout, training=self.training)
+
+        # project to attn_output
+        main_attn_output = torch.bmm(main_attn_probs, main_value_states)
+        main_attn_output = (
+            main_attn_output.transpose(0, 1).contiguous().view(1, main_sequence_length, batch_size, hidden_size)
+        )
+        main_attn_output = self.out_proj(main_attn_output)
+
+        # PREDICT-STREAM
+        # [ngram, B*head, T, c]
+        predict_query_states = torch.cat(predict_query_states_list, 0).view(
+            self.ngram, -1, main_sequence_length, self.head_dim
+        )
+        # [ngram, B*head, 2*T, c]
+        predict_key_states = torch.cat(
+            [torch.cat([main_key_states, key], 1).unsqueeze(0) for key in predict_key_states_list], 0
+        )
+
+        # [ngram, T, B, C]
+        predict_hidden_states = torch.cat(hidden_states_predict_list, 0).view(
+            self.ngram, main_sequence_length, batch_size, hidden_size
+        )
+
+        # [ngram, B*head, 2*T, c]
+        predict_value_states = torch.cat(
+            [torch.cat([main_value_states, v_p], 1).unsqueeze(0) for v_p in predict_value_states_list], 0
+        )
+        # [ngram, B*head, T, 2*T]
+        predict_attn_weights = torch.einsum("nbtc,nbsc->nbts", (predict_query_states, predict_key_states))
+
+        # [ngram, B*head, T, S]
+        # retrieve relative position embeddings for each layer -> see paper for more details
+        predict_relative_pos_embeddings = self.get_predict_relative_pos_embeddings(
+            predict_hidden_states, predict_attn_weights, position_ids, predict_relative_position_buckets
+        )
+
+        # [ngram, B*head, T, 2*T]
+        predict_attn_weights = predict_attn_weights + predict_relative_pos_embeddings
+
+        if extended_predict_attention_mask is not None:
+            predict_attn_weights = predict_attn_weights + extended_predict_attention_mask
+
+        predict_attn_probs = softmax(
+            predict_attn_weights,
+            dim=-1,
+            onnx_trace=self.onnx_trace,
+        ).type_as(predict_attn_weights)
+        predict_attn_probs = F.dropout(predict_attn_probs, p=self.attention_dropout, training=self.training)
+
+        # project to attention output
+        # [ngram, B*head, T, c]
+        predict_attn_output = torch.einsum("nbts,nbsc->nbtc", (predict_attn_probs, predict_value_states))
+        # [ngram, T, B, C]
+        predict_attn_output = (
+            predict_attn_output.transpose(1, 2)
+            .contiguous()
+            .view(self.ngram, main_sequence_length, batch_size, hidden_size)
+        )
+        predict_attn_output = self.out_proj(predict_attn_output)
+
+        # concat to single attn output
+        # [1+ngram*T, B, C]
+        attn_output = torch.cat([main_attn_output, predict_attn_output], 0).view(-1, batch_size, hidden_size)
+
+        # reshape into better form for `config.output_attentions`
+        main_attn_probs = main_attn_probs.view(batch_size, self.num_attn_heads, main_sequence_length, -1)
+        predict_attn_probs = predict_attn_probs.view(
+            self.ngram, batch_size, self.num_attn_heads, main_sequence_length, -1
+        ).transpose(0, 1)
+
+        attn_output = F.dropout(attn_output, p=self.dropout, training=self.training)
+        return attn_output, main_attn_probs, predict_attn_probs
+
+    def get_main_relative_pos_embeddings(
+        self, hidden_states, attn_weights, position_ids, main_relative_position_buckets
+    ):
+        # input hidden_states [T,B,C], input attn_weights [T*head,T,S], input position_ids [B,T] or [1,1]
+
+        if main_relative_position_buckets is None:
+            batch_size, sequence_length = hidden_states.shape[:2]
+            relative_positions = (
+                torch.arange(1, attn_weights.shape[-1] + 1)
+                .unsqueeze(0)
+                .unsqueeze(0)
+                .repeat(batch_size, sequence_length, 1)
+                .to(position_ids.device)
+            )
+            relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(
+                batch_size, sequence_length, 1
+            )  # [B, T, s]
+            main_relative_position_buckets = compute_relative_buckets(
+                self.num_buckets, self.relative_max_distance, relative_positions, False
+            )
+
+        hidden_states = hidden_states.transpose(0, 1)  # [B,T,C]
+        rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)  # [B,T,Buckets*head]
+        rel_pos_embeddings = rel_pos_embeddings.view(
+            rel_pos_embeddings.shape[:2] + (self.num_buckets, self.num_attn_heads)
+        ).permute(
+            0, 3, 1, 2
+        )  # [B,T,Buckets,head]
+        rel_pos_embeddings = rel_pos_embeddings.reshape(attn_weights.shape[:2] + (-1,))  # [B*head,T,Buckets]
+
+        main_relative_position_buckets = (
+            main_relative_position_buckets.repeat(1, self.num_attn_heads, 1)
+            .view(-1, main_relative_position_buckets.shape[-1])
+            .long()
+        )  # [B*head*T, T]
+        rel_pos_embeddings = rel_pos_embeddings.reshape(-1, rel_pos_embeddings.size(-1))  # [B*head*T,Buckets]
+
+        main_relative_pos_embeddings = torch.gather(
+            rel_pos_embeddings, dim=1, index=main_relative_position_buckets
+        ).view(attn_weights.shape[:2] + (-1,))
+
+        return main_relative_pos_embeddings
+
+    def get_predict_relative_pos_embeddings(
+        self, hidden_states, attn_weights, position_ids, predict_relative_position_buckets
+    ):
+        # input hidden_states [ngram, T,B,C], input attn_weights [ngram, B*head,T,S], input position_ids [B,T] or [1,1], input predict_relative_position_buckets [B,T, 2*T] or None
+
+        sequence_length, batch_size = hidden_states.shape[1:3]
+
+        if predict_relative_position_buckets is None:
+            key_sequence_length = attn_weights.shape[-1]
+            assert (
+                position_ids[0][0] == key_sequence_length - 1
+            ), "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
+            relative_positions = (
+                torch.arange(0, key_sequence_length)
+                .unsqueeze(0)
+                .unsqueeze(0)
+                .repeat(batch_size, sequence_length, 1)
+                .to(position_ids.device)
+            )
+
+            relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
+            predict_relative_position_buckets = compute_relative_buckets(
+                self.num_buckets, self.relative_max_distance, relative_positions, False
+            )
+
+        hidden_states = hidden_states.transpose(1, 2)  # [ngram, B, T, C]
+        rel_pos_embeddings = self.relative_pos_embeddings(hidden_states).view(
+            hidden_states.shape[:-1] + (self.num_buckets, self.num_attn_heads)
+        )  # [ngram, B, T, bucket, head]
+        rel_pos_embeddings = rel_pos_embeddings.permute(0, 1, 4, 2, 3).reshape(
+            self.ngram * batch_size * self.num_attn_heads, sequence_length, -1
+        )  # [ngram*B*head, T, bucket]
+
+        predict_relative_position_buckets = predict_relative_position_buckets.unsqueeze(0).repeat(
+            self.ngram, 1, self.num_attn_heads, 1
+        )  # [ngram, B, head*T, S]
+
+        rel_pos_embeddings = rel_pos_embeddings.reshape(-1, rel_pos_embeddings.size(-1))
+        predict_relative_position_buckets = predict_relative_position_buckets.view(
+            -1, predict_relative_position_buckets.size(-1)
+        ).long()  # [ngram*B*head*T, S]
+
+        predict_relative_pos_embeddings = torch.gather(
+            rel_pos_embeddings, dim=1, index=predict_relative_position_buckets
+        ).view(
+            self.ngram, batch_size * self.num_attn_heads, sequence_length, -1
+        )  # [ngram, B*head, T, S]
+
+        return predict_relative_pos_embeddings
+
+
+class ProphetNetEncoderLayer(nn.Module):
+    """
+    Encoder block for Prophetnet
+    """
+
+    def __init__(self, config: ProphetNetConfig):
+        super().__init__()
+        # 1st residual block
+        self.self_attn = ProphetNetSelfAttention(config, config.num_encoder_attention_heads)
+        self.self_attn_layer_norm = ProphetNetLayerNorm(config.hidden_size)
+
+        # 2nd residual block
+        self.feed_forward = ProhpetNetFeedForward(config, config.encoder_ffn_dim)
+        self.feed_forward_layer_norm = ProphetNetLayerNorm(config.hidden_size)
+
+    def forward(self, hidden_states, attention_mask):
+        # 1st residual block
+        attention_output, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+        )
+        hidden_states = self.self_attn_layer_norm(attention_output + hidden_states)
+
+        # 2nd residual block
+        feed_forward_output = self.feed_forward(hidden_states)
+        hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
+        return hidden_states, attn_weights
+
+
+class ProphetNetDecoderLayer(nn.Module):
+    """
+    Decoder block for Prophetnet
+    """
+
+    def __init__(self, config: ProphetNetConfig):
+        super().__init__()
+        # 1st residual block
+        self.self_attn = ProphetNetNgramProphetNetSelfAttention(config)
+        self.self_attn_layer_norm = ProphetNetLayerNorm(config.hidden_size)
+
+        # 2nd residual block
+        if config.add_cross_attention:
+            self.cross_attn = ProphetNetSelfAttention(config, config.num_decoder_attention_heads)
+            self.cross_attn_layer_norm = ProphetNetLayerNorm(config.hidden_size)
+
+        # 3rd residual block
+        self.feed_forward = ProhpetNetFeedForward(config, config.decoder_ffn_dim)
+        self.feed_forward_layer_norm = ProphetNetLayerNorm(config.hidden_size)
+
+    def forward(
+        self,
+        hidden_states,
+        encoder_hidden_states=None,
+        encoder_attn_mask=None,
+        layer_state=None,
+        attention_mask=None,
+        extended_predict_attention_mask=None,
+        main_relative_position_buckets=None,
+        predict_relative_position_buckets=None,
+        position_ids=None,
+    ):
+        layer_state = layer_state if layer_state is not None else {}
+
+        # 1st residual block
+        ngram_attention_output, self_attn_weights, self_attn_weights_ngram = self.self_attn(
+            hidden_states=hidden_states,
+            layer_state=layer_state,
+            attention_mask=attention_mask,
+            extended_predict_attention_mask=extended_predict_attention_mask,
+            main_relative_position_buckets=main_relative_position_buckets,
+            predict_relative_position_buckets=predict_relative_position_buckets,
+            position_ids=position_ids,
+        )
+        hidden_states = self.self_attn_layer_norm(hidden_states + ngram_attention_output)
+
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            # 2nd residual block
+            attention_output, cross_attn_weights = self.cross_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attn_mask,
+                layer_state=layer_state,  # mutates layer state
+            )
+            hidden_states = self.cross_attn_layer_norm(attention_output + hidden_states)
+
+        # 3rd residual block
+        feed_forward_output = self.feed_forward(hidden_states)
+        hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
+
+        return (
+            hidden_states,
+            self_attn_weights,
+            self_attn_weights_ngram,
+            cross_attn_weights,
+            layer_state,
+        )  # just self_attn weights for now, following t5, layer_state = cache for decoding
+
+
+@add_start_docstrings(
+    "The standalone encoder part of the ProphetNetModel.",
+    PROPHETNET_START_DOCSTRING,
+)
+class ProphetNetEncoder(ProphetNetPreTrainedModel):
+    r"""
+    word_embeddings  (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
+        The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with pre-defined
+        word embeddings instead of randomely initialized word embeddings.
+    """
+
+    def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
+        super().__init__(config)
+
+        self.word_embeddings = (
+            word_embeddings
+            if word_embeddings is not None
+            else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        )
+        self.position_embeddings = ProhpetNetPositionalEmbeddings(config)
+        self.embeddings_layer_norm = ProphetNetLayerNorm(config.hidden_size)
+
+        self.layers = nn.ModuleList([ProphetNetEncoderLayer(config) for _ in range(config.num_encoder_layers)])
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.word_embeddings = value
+
+    @add_start_docstrings_to_callable(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Returns:
+
+        Example::
+
+            >>> from transformers import ProphetNetTokenizer, ProphetNetEncoder
+            >>> import torch
+
+            >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
+            >>> model = ProphetNetEncoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone', return_dict=True)
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> outputs = model(**inputs)
+
+            >>> last_hidden_states = outputs.last_hidden_state
+        """
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("Either input_ids or inputs_embeds has to be passed.")
+        elif input_ids is not None and inputs_embeds is not None:
+            raise ValueError("Make sure to only pass input_ids or inputs_embeds.")
+        elif input_ids is not None and inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        # prepare attention mask
+        if attention_mask is not None:
+            extended_attention_mask = (
+                1.0 - attention_mask[:, None, :].repeat(self.config.num_attention_heads, 1, 1)
+            ) * -10000.0
+            extended_attention_mask = extended_attention_mask.to(inputs_embeds.dtype)
+        else:
+            extended_attention_mask = None
+
+        position_embeddings, position_ids = self.position_embeddings(inputs_embeds.shape[:2], inputs_embeds.device)
+
+        hidden_states = inputs_embeds + position_embeddings
+        hidden_states = self.embeddings_layer_norm(hidden_states)
+        hidden_states = F.dropout(hidden_states, p=self.config.dropout, training=self.training)
+        hidden_states = hidden_states.transpose(0, 1)  # B x T x C -> T x B x C
+
+        encoder_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                encoder_hidden_states = encoder_hidden_states + (hidden_states.transpose(0, 1),)
+            hidden_states, attn_probs = encoder_layer(hidden_states, attention_mask=extended_attention_mask)
+            if output_attentions:
+                all_attentions = all_attentions + (attn_probs,)
+
+        hidden_states = hidden_states.transpose(0, 1)
+        if output_hidden_states:
+            encoder_hidden_states = encoder_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_hidden_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_hidden_states, attentions=all_attentions
+        )
+
+
+@add_start_docstrings(
+    "The standalone decoder part of the ProphetNetModel.",
+    PROPHETNET_START_DOCSTRING,
+)
+class ProphetNetDecoder(ProphetNetPreTrainedModel):
+    r"""
+    word_embeddings  (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
+        The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with pre-defined
+        word embeddings instead of randomely initialized word embeddings.
+    """
+
+    def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
+        super().__init__(config)
+
+        self.ngram = config.ngram
+        self.num_buckets = config.num_buckets
+        self.relative_max_distance = config.relative_max_distance
+        self.dropout = config.dropout
+        self.max_target_positions = config.max_position_embeddings
+
+        self.word_embeddings = (
+            word_embeddings
+            if word_embeddings is not None
+            else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        )
+        self.position_embeddings = ProhpetNetPositionalEmbeddings(config)
+
+        self.ngram_embeddings = nn.Embedding(self.ngram, config.hidden_size, None)
+        self.layers = nn.ModuleList([ProphetNetDecoderLayer(config) for _ in range(config.num_decoder_layers)])
+        self.embeddings_layer_norm = ProphetNetLayerNorm(config.hidden_size)
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.word_embeddings = value
+
+    @add_start_docstrings_to_callable(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=ProphetNetDecoderModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+            if the model is configured as a decoder.
+        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
+            is used in the cross-attention if the model is configured as a decoder.
+            Mask values selected in ``[0, 1]``:
+        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
+
+            If :obj:`past_key_values` are used, the user can optionally input only the last
+            ``decoder_input_ids`` (those that don't have their past key value states given to this model) of shape
+            :obj:`(batch_size, 1)` instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
+        use_cache (:obj:`bool`, `optional`):
+            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+            decoding (see :obj:`past_key_values`).
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        Returns:
+
+        Example::
+
+            >>> from transformers import ProphetNetTokenizer, ProphetNetDecoder
+            >>> import torch
+
+            >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
+            >>> model = ProphetNetDecoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone', add_cross_attention=False, return_dict=True)
+            >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> outputs = model(**inputs)
+
+            >>> last_hidden_states = outputs.last_hidden_state
+        """
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("Either `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.")
+        elif input_ids is not None and inputs_embeds is not None:
+            raise ValueError("Make sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.")
+        elif input_ids is not None and inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        batch_size, sequence_length = inputs_embeds.shape[:2]
+
+        main_stream_pos_embed, position_ids = self.position_embeddings(
+            (batch_size, sequence_length),
+            device=inputs_embeds.device,
+            past_key_values=past_key_values,
+        )
+
+        if past_key_values is not None:
+            main_relative_position_buckets, predict_relative_position_buckets = None, None
+        else:
+            (
+                main_relative_position_buckets,
+                predict_relative_position_buckets,
+            ) = self.compute_buffered_relative_buckets(position_ids)
+        predicting_stream_pos_embed = self.position_embeddings._forward(position_ids + 1)
+
+        # add position embeddings
+        hidden_states = inputs_embeds + main_stream_pos_embed
+        hidden_states = hidden_states.transpose(0, 1)
+
+        ngram_embeddings = self.ngram_embeddings.weight
+
+        # prepare attention mask
+        if past_key_values is not None:
+            assert (
+                hidden_states.size(0) == 1
+            ), "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"
+
+            ngram_hidden_states = [
+                (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).transpose(0, 1).repeat(1, batch_size, 1)
+                for ngram in range(self.ngram)
+            ]
+            extended_attention_mask = None
+            extended_predict_attention_mask = None
+        else:
+            ngram_hidden_states = [
+                (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).transpose(0, 1)
+                for ngram in range(self.ngram)
+            ]
+            extended_attention_mask = self.prepare_attention_mask(hidden_states, attention_mask)
+            extended_predict_attention_mask = self.prepare_predict_attention_mask(hidden_states, attention_mask)
+
+        # prepare encoder attention mask
+        if encoder_attention_mask is not None:
+            extended_encoder_attention_mask = (
+                1.0 - encoder_attention_mask[:, None, :].repeat(self.config.num_attention_heads, 1, 1)
+            ) * -10000.0
+            extended_encoder_attention_mask = extended_encoder_attention_mask.to(inputs_embeds.dtype)
+        else:
+            extended_encoder_attention_mask = None
+
+        hidden_states = torch.cat([hidden_states] + ngram_hidden_states, 0)
+
+        if self.embeddings_layer_norm:
+            hidden_states = self.embeddings_layer_norm(hidden_states)
+
+        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        if encoder_hidden_states is not None:
+            encoder_hidden_states = encoder_hidden_states.transpose(0, 1)
+
+        # init attentions, hidden_states and cache with empty tuples
+        all_main_stream_hidden_states = () if output_hidden_states else None
+        all_ngram_stream_hidden_states = () if output_hidden_states and self.config.ngram > 0 else None
+
+        all_main_stream_attns = () if output_attentions else None
+        all_ngram_stream_attns = () if output_attentions else None
+        all_cross_attns = () if output_attentions and self.config.add_cross_attention else None
+        present_key_values = () if use_cache else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_main_stream_hidden_states += (hidden_states[:sequence_length].transpose(0, 1),)
+                if self.config.ngram > 0:
+                    all_ngram_stream_hidden_states += (hidden_states[sequence_length:].transpose(0, 1),)
+
+            layer_state = past_key_values[idx] if past_key_values is not None else None
+            (
+                hidden_states,
+                layer_self_attn,
+                layer_self_predict_attn_output,
+                layer_cross_attn,
+                layer_past,
+            ) = decoder_layer(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attn_mask=extended_encoder_attention_mask,
+                layer_state=layer_state,
+                attention_mask=extended_attention_mask,
+                extended_predict_attention_mask=extended_predict_attention_mask,
+                main_relative_position_buckets=main_relative_position_buckets,
+                predict_relative_position_buckets=predict_relative_position_buckets,
+                position_ids=position_ids,
+            )
+            if use_cache:
+                present_key_values += (layer_past,)
+
+            if output_attentions:
+                all_main_stream_attns += (layer_self_attn,)
+                all_ngram_stream_attns += (layer_self_predict_attn_output,)
+
+                if self.config.add_cross_attention:
+                    all_cross_attns += (layer_cross_attn,)
+
+        if output_hidden_states:
+            all_main_stream_hidden_states += (hidden_states[:sequence_length].transpose(0, 1),)
+            if self.config.ngram > 0:
+                all_ngram_stream_hidden_states += (hidden_states[sequence_length:].transpose(0, 1),)
+
+        # split last_hidden_state for return
+        last_hidden_state = hidden_states[:sequence_length].transpose(0, 1)
+        last_hidden_state_ngram = hidden_states[sequence_length:].transpose(0, 1) if self.config.ngram > 0 else None
+        encoder_hidden_states = encoder_hidden_states.transpose(0, 1) if encoder_hidden_states is not None else None
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    last_hidden_state,
+                    last_hidden_state_ngram,
+                    present_key_values,
+                    all_main_stream_hidden_states,
+                    all_ngram_stream_hidden_states,
+                    all_main_stream_attns,
+                    all_ngram_stream_attns,
+                    all_cross_attns,
+                ]
+                if v is not None
+            )
+        return ProphetNetDecoderModelOutput(
+            last_hidden_state=last_hidden_state,
+            last_hidden_state_ngram=last_hidden_state_ngram,
+            past_key_values=present_key_values,
+            hidden_states=all_main_stream_hidden_states,
+            hidden_states_ngram=all_ngram_stream_hidden_states,
+            attentions=all_main_stream_attns,
+            ngram_attentions=all_ngram_stream_attns,
+            cross_attentions=all_cross_attns,
+        )
+
+    def compute_buffered_relative_buckets(self, position_ids):
+        batch_size, sequence_length = position_ids.shape
+
+        position_ids = torch.arange(1, self.max_target_positions).to(position_ids.device).repeat(1, 1)
+        main_relative_buckets, predict_relative_buckets = compute_all_stream_relative_buckets(
+            self.num_buckets, self.relative_max_distance, position_ids
+        )
+
+        # buffer relative buckets
+        main_relative_buckets = main_relative_buckets[:, :sequence_length, :sequence_length].repeat(batch_size, 1, 1)
+        predict_relative_buckets = torch.cat(
+            [
+                predict_relative_buckets[:, :sequence_length, :sequence_length],
+                predict_relative_buckets[
+                    :, :sequence_length, self.max_target_positions : self.max_target_positions + sequence_length
+                ],
+            ],
+            2,
+        ).repeat(batch_size, 1, 1)
+
+        return main_relative_buckets, predict_relative_buckets
+
+    def prepare_attention_mask(self, hidden_states, attention_mask):
+        seq_length, batch_size = hidden_states.shape[:2]
+
+        # get causal mask
+        causal_mask = hidden_states.new(seq_length, seq_length).float().fill_(-float("inf"))
+        causal_mask = torch.triu(causal_mask, 1)
+        extended_causal_mask = causal_mask[:seq_length, :seq_length][None, :, :].expand(
+            (batch_size,) + causal_mask.shape
+        )
+
+        # add usual attention mask
+        if attention_mask is not None:
+            extended_attention_mask = (1.0 - attention_mask[:, None, :]) * -10000.0
+            extended_attention_mask = extended_causal_mask + extended_attention_mask
+        else:
+            extended_attention_mask = extended_causal_mask
+        return extended_attention_mask.repeat(self.config.num_decoder_attention_heads, 1, 1).to(hidden_states.dtype)
+
+    def prepare_predict_attention_mask(self, hidden_states, attention_mask):
+        seq_length, batch_size = hidden_states.shape[:2]
+
+        # get causal mask
+        predict_causal_mask = ngram_attention_bias(
+            self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype
+        )
+        predict_causal_mask = torch.cat(
+            [
+                predict_causal_mask[:, :seq_length, :seq_length],
+                predict_causal_mask[
+                    :, :seq_length, self.max_target_positions : self.max_target_positions + seq_length
+                ],
+            ],
+            dim=-1,
+        )
+        extended_predict_causal_mask = predict_causal_mask[:, None, :, :].expand(
+            predict_causal_mask.shape[:1] + (batch_size,) + predict_causal_mask.shape[1:]
+        )
+
+        # add usual attention mask
+        if attention_mask is not None:
+            extended_attention_mask = (1.0 - attention_mask[None, :, None, :]) * -10000.0
+            extended_attention_mask = extended_attention_mask.expand((self.ngram, batch_size, seq_length, seq_length))
+            # predicted stream attention_mask should always be 0
+            extended_attention_mask = torch.cat(
+                [extended_attention_mask, torch.zeros_like(extended_attention_mask)], dim=-1
+            )
+            extended_predict_attention_mask = extended_predict_causal_mask + extended_attention_mask
+        else:
+            extended_predict_attention_mask = extended_predict_causal_mask
+        return extended_predict_attention_mask.repeat(1, self.config.num_decoder_attention_heads, 1, 1).to(
+            hidden_states.dtype
+        )
+
+
+@add_start_docstrings(
+    "The bare ProphetNet Model outputting raw hidden-states without any specific head on top.",
+    PROPHETNET_START_DOCSTRING,
+)
+class ProphetNetModel(ProphetNetPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+
+        encoder_config = copy.deepcopy(config)
+        encoder_config.is_encoder_decoder = False
+        encoder_config.use_cache = False
+        self.encoder = ProphetNetEncoder(encoder_config, self.word_embeddings)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        decoder_config.is_encoder_decoder = False
+        self.decoder = ProphetNetDecoder(decoder_config, self.word_embeddings)
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.word_embeddings = value
+        self.encoder.word_embeddings = self.word_embeddings
+        self.decoder.word_embeddings = self.word_embeddings
+
+    def get_encoder(self):
+        return self.encoder
+
+    def get_decoder(self):
+        return self.decoder
+
+    @add_start_docstrings_to_callable(PROPHETNET_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=ProphetNetSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        decoder_input_ids=None,
+        decoder_attention_mask=None,
+        encoder_outputs: Optional[Tuple] = None,
+        past_key_values=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Returns:
+
+        Example::
+
+            >>> from transformers import ProphetNetTokenizer, ProphetNetModel
+
+            >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
+            >>> model = ProphetNetModel.from_pretrained('microsoft/prophetnet-large-uncased')
+
+            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+            >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+            >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, return_dict=True)
+
+            >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
+            >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
+        """
+
+        use_cache == use_cache if use_cache is not None else self.config.use_cache
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+        return ProphetNetSeq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            last_hidden_state_ngram=decoder_outputs.last_hidden_state_ngram,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_ngram_hidden_states=decoder_outputs.hidden_states_ngram,
+            decoder_attentions=decoder_outputs.attentions,
+            decoder_ngram_attentions=decoder_outputs.ngram_attentions,
+            decoder_cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    "The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.",
+    PROPHETNET_START_DOCSTRING,
+)
+class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
+    def __init__(self, config: ProphetNetConfig):
+        super().__init__(config)
+        self.prophetnet = ProphetNetModel(config)
+        self.padding_idx = config.pad_token_id
+        self.disable_ngram_loss = config.disable_ngram_loss
+
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def get_input_embeddings(self):
+        return self.prophetnet.word_embeddings
+
+    @add_start_docstrings_to_callable(PROPHETNET_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=ProphetNetSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        decoder_input_ids=None,
+        decoder_attention_mask=None,
+        encoder_outputs=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        labels=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`.
+            All labels set to ``-100`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size]``
+
+        Returns:
+
+        Example::
+
+            >>> from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration
+
+            >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
+            >>> model = ProphetNetForConditionalGeneration.from_pretrained('microsoft/prophetnet-large-uncased')
+
+            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+            >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+            >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, return_dict=True)
+
+            >>> logits_next_token = outputs.logits  # logits to predict next token as usual
+            >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+            # get decoder inputs from shifting lm labels to the right
+            decoder_input_ids = self._shift_right(labels)
+
+        outputs = self.prophetnet(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        batch_size, sequence_length = (
+            decoder_input_ids.shape if decoder_input_ids is not None else decoder_inputs_embeds.shape[:2]
+        )
+
+        predicting_streams = outputs[1].view(batch_size, self.config.ngram, sequence_length, -1)
+        predict_logits = self.lm_head(predicting_streams)
+
+        logits = predict_logits[:, 0]
+        logits_ngram = predict_logits[:, 1:] if self.config.ngram > 1 else None
+
+        loss = None
+        if labels is not None:
+            loss = self._compute_loss(predict_logits, labels)
+
+        if not return_dict:
+            all_logits = tuple(v for v in [logits, logits_ngram] if v is not None)
+            return (loss,) + all_logits + outputs[2:] if loss is not None else all_logits + outputs[2:]
+        else:
+            return ProphetNetSeq2SeqLMOutput(
+                loss=loss,
+                logits=logits,
+                logits_ngram=logits_ngram,
+                past_key_values=outputs.past_key_values,
+                decoder_hidden_states=outputs.decoder_hidden_states,
+                decoder_ngram_hidden_states=outputs.decoder_ngram_hidden_states,
+                decoder_attentions=outputs.decoder_attentions,
+                decoder_ngram_attentions=outputs.decoder_ngram_attentions,
+                decoder_cross_attentions=outputs.decoder_cross_attentions,
+                encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+                encoder_hidden_states=outputs.encoder_hidden_states,
+                encoder_attentions=outputs.encoder_attentions,
+            )
+
+    def _compute_loss(self, logits, labels):
+        expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(self.padding_idx)
+
+        for i in range(self.config.ngram):
+            if i > 0 and self.disable_ngram_loss:
+                break
+            expend_targets[i, :, :] = labels
+
+        lprobs = F.log_softmax(
+            logits.view(-1, logits.size(-1)),
+            dim=-1,
+            dtype=torch.float32,
+        )
+
+        loss = F.nll_loss(lprobs, expend_targets.view(-1), reduction="sum")
+
+        if self.config.eps > 0.0:
+            smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
+            non_pad_mask = expend_targets.ne(self.padding_idx).view(-1)
+            smooth_loss = smooth_loss[non_pad_mask]
+            smooth_loss = smooth_loss.sum()
+
+            eps_i = self.config.eps / lprobs.size(-1)
+            loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss
+
+        return loss
+
+    def prepare_inputs_for_generation(
+        self, decoder_input_ids, past, attention_mask, use_cache, encoder_outputs, **kwargs
+    ):
+        assert encoder_outputs is not None, "`encoder_outputs` have to be passed for generation."
+
+        if past:
+            decoder_input_ids = decoder_input_ids[:, -1:]
+        # first step, decoder_cached_states are empty
+        return {
+            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
+            "encoder_outputs": encoder_outputs,
+            "past_key_values": past,
+            "decoder_input_ids": decoder_input_ids,
+            "attention_mask": attention_mask,
+            "use_cache": use_cache,
+        }
+
+    @staticmethod
+    def _reorder_cache(past, beam_idx):
+        # this function reorders the cache for beam search
+        def _reorder_cache(cache_dict, beam_idx):
+            for k, key_value_states in cache_dict.items():
+                if key_value_states is not None:
+                    cache_dict[k] = key_value_states.index_select(0, beam_idx)
+            return cache_dict
+
+        reordered_past = []
+        for layer_past in past:
+            # get the correct batch idx from decoder layer's batch dim for cross and self-attn
+            layer_past_new = {
+                attn_key: _reorder_cache(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items()
+            }
+            reordered_past.append(layer_past_new)
+        return reordered_past
+
+    def get_encoder(self):
+        return self.prophetnet.encoder
+
+    def get_decoder(self):
+        return self.prophetnet.decoder
+
+
+@add_start_docstrings(
+    "The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal language modeling.",
+    PROPHETNET_START_DOCSTRING,
+)
+class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        # set config for CLM
+        config = copy.deepcopy(config)
+        config.is_decoder = True
+        config.is_encoder_decoder = False
+        self.decoder = ProphetNetDecoder(config)
+
+        self.padding_idx = config.pad_token_id
+        self.disable_ngram_loss = config.disable_ngram_loss
+
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.decoder.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.decoder.word_embeddings = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    @add_start_docstrings_to_callable(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=ProphetNetDecoderLMOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        labels=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+            if the model is configured as a decoder.
+        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
+            is used in the cross-attention if the model is configured as a decoder.
+            Mask values selected in ``[0, 1]``:
+        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
+
+            If :obj:`past_key_values` are used, the user can optionally input only the last
+            ``decoder_input_ids`` (those that don't have their past key value states given to this model) of shape
+            :obj:`(batch_size, 1)` instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
+        use_cache (:obj:`bool`, `optional`):
+            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+            decoding (see :obj:`past_key_values`).
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the left-to-right language modeling loss (next word prediction).
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            n ``[0, ..., config.vocab_size]``
+
+        Returns:
+
+        Example::
+
+            >>> from transformers import ProphetNetTokenizer, ProphetNetForCausalLM
+            >>> import torch
+
+            >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
+            >>> model = ProphetNetForCausalLM.from_pretrained('patrickvonplaten/prophetnet-decoder-clm-large-uncased', return_dict=True)
+            >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> outputs = model(**inputs, return_dict=True)
+
+            >>> logits = outputs.logits
+
+            >>> # Model can also be used with EncoderDecoder framework
+            >>> from transformers import BertTokenizer, EncoderDecoderModel
+            >>> import torch
+
+            >>> tokenizer = BertTokenizer.from_pretrained('bert-uncased-large')
+            >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-uncased-large", "patrickvonplaten/prophetnet-decoder-clm-large-uncased")
+
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> outputs = model(input_ids=inputs["input_ids"], labels=inputs["input_ids"], return_dict=True)
+
+            >>> loss = outputs.loss
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.decoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        batch_size, sequence_length = input_ids.shape if input_ids is not None else inputs_embeds.shape[:2]
+
+        predicting_streams = outputs[1].view(batch_size, self.config.ngram, sequence_length, -1)
+        predict_logits = self.lm_head(predicting_streams)
+
+        logits = predict_logits[:, 0]
+        logits_ngram = predict_logits[:, 1:] if self.config.ngram > 1 else None
+
+        loss = None
+        if labels is not None:
+            loss = self._compute_loss(predict_logits, labels)
+
+        if not return_dict:
+            all_logits = tuple(v for v in [logits, logits_ngram] if v is not None)
+            return (loss,) + all_logits + outputs[2:] if loss is not None else all_logits + outputs[2:]
+        else:
+            return ProphetNetDecoderLMOutput(
+                loss=loss,
+                logits=logits,
+                logits_ngram=logits_ngram,
+                past_key_values=outputs.past_key_values,
+                hidden_states=outputs.hidden_states,
+                hidden_states_ngram=outputs.hidden_states_ngram,
+                attentions=outputs.attentions,
+                ngram_attentions=outputs.ngram_attentions,
+            )
+
+    def _compute_loss(self, logits, labels):
+        expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(self.padding_idx)
+
+        for i in range(self.config.ngram):
+            if i > 0 and self.disable_ngram_loss:
+                break
+            expend_targets[i, :, :] = labels
+
+        lprobs = F.log_softmax(
+            logits.view(-1, logits.size(-1)),
+            dim=-1,
+            dtype=torch.float32,
+        )
+
+        loss = F.nll_loss(lprobs, expend_targets.view(-1), reduction="sum")
+
+        if self.config.eps > 0.0:
+            smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
+            non_pad_mask = expend_targets.ne(self.padding_idx).view(-1)
+            smooth_loss = smooth_loss[non_pad_mask]
+            smooth_loss = smooth_loss.sum()
+
+            eps_i = self.config.eps / lprobs.size(-1)
+            loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss
+
+        return loss
+
+    def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=None, **kwargs):
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_ids.shape)
+
+        if past:
+            input_ids = input_ids[:, -1:]
+        # first step, decoder_cached_states are empty
+        return {
+            "input_ids": input_ids,  # encoder_outputs is defined. input_ids not needed
+            "attention_mask": attention_mask,
+            "past_key_values": past,
+            "use_cache": use_cache,
+        }
+
+    @staticmethod
+    def _reorder_cache(past, beam_idx):
+        # this function reorders the cache for beam search
+        def _reorder_cache(cache_dict, beam_idx):
+            for k, key_value_states in cache_dict.items():
+                if key_value_states is not None:
+                    cache_dict[k] = key_value_states.index_select(0, beam_idx)
+            return cache_dict
+
+        reordered_past = []
+        for layer_past in past:
+            # get the correct batch idx from decoder layer's batch dim for cross and self-attn
+            layer_past_new = {
+                attn_key: _reorder_cache(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items()
+            }
+            reordered_past.append(layer_past_new)
+        return reordered_past
+
+    def set_decoder(self, decoder):
+        self.decoder = decoder
+
+    def get_decoder(self):
+        return self.decoder
diff --git a/src/transformers/modeling_tf_lxmert.py b/src/transformers/modeling_tf_lxmert.py
index eeb576d3b8..3501452249 100644
--- a/src/transformers/modeling_tf_lxmert.py
+++ b/src/transformers/modeling_tf_lxmert.py
@@ -17,7 +17,6 @@
 """ TF 2.0 LXMERT model. """
 
 
-import logging
 from dataclasses import dataclass
 from typing import Dict, Optional, Tuple
 
@@ -34,9 +33,10 @@ from .file_utils import (
 )
 from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list
 from .tokenization_utils_base import BatchEncoding
+from .utils import logging
 
 
-logger = logging.getLogger(__name__)
+logger = logging.get_logger(__name__)
 
 
 _CONFIG_FOR_DOC = "LxmertConfig"
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 3b6c4ce123..04a65ae90a 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -490,12 +490,17 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
             self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
 
         if self.config.is_encoder_decoder and self.config.tie_encoder_decoder:
+            if hasattr(self, self.base_model_prefix):
+                self = getattr(self, self.base_model_prefix)
             self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix)
 
     @staticmethod
     def _tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_model_prefix: str):
         uninitialized_encoder_weights: List[str] = []
-        assert decoder.__class__ == encoder.__class__, f"{decoder.__class__} and {encoder.__class__} have to be equal."
+        if decoder.__class__ != encoder.__class__:
+            logger.info(
+                f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
+            )
 
         def tie_encoder_to_decoder_recursively(
             decoder_pointer: nn.Module,
@@ -528,7 +533,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
                     if name.isdigit():
                         encoder_name = str(int(name) + encoder_layer_pos)
                         decoder_name = name
-                        if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])):
+                        if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])) and len(
+                            encoder_modules
+                        ) != len(decoder_modules):
                             # this can happen if the name corresponds to the position in a list module list of layers
                             # in this case the decoder has added a cross-attention that the encoder does not have
                             # thus skip this step and substract one layer pos from encoder
diff --git a/src/transformers/modeling_xlm_prophetnet.py b/src/transformers/modeling_xlm_prophetnet.py
new file mode 100644
index 0000000000..d8db0d79e7
--- /dev/null
+++ b/src/transformers/modeling_xlm_prophetnet.py
@@ -0,0 +1,159 @@
+# coding=utf-8
+# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch XLM-ProphetNet model."""
+
+from .configuration_xlm_prophetnet import XLMProphetNetConfig
+from .modeling_prophetnet import (
+    ProphetNetDecoder,
+    ProphetNetEncoder,
+    ProphetNetForCausalLM,
+    ProphetNetForConditionalGeneration,
+    ProphetNetModel,
+)
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+_TOKENIZER_FOR_DOC = "XLMProphetNetTokenizer"
+
+XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "microsoft/xprophetnet-large-wiki100-cased",
+    # See all ProphetNet models at https://huggingface.co/models?filter=xprophetnet
+]
+
+
+class XLMProphetNetEncoder(ProphetNetEncoder):
+    r"""
+    This class overrides :class:`~transformers.ProphetNetEncoder`. Please check the
+    superclass for the appropriate documentation alongside usage examples.
+
+    Example::
+
+        >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetEncoder
+        >>> import torch
+
+        >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
+        >>> model = XLMProphetNetEncoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone', return_dict=True)
+        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+    """
+
+    config_class = XLMProphetNetConfig
+
+
+class XLMProphetNetDecoder(ProphetNetDecoder):
+    r"""
+    This class overrides :class:`~transformers.ProphetNetDecoder`. Please check the
+    superclass for the appropriate documentation alongside usage examples.
+
+    Example::
+
+        >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetDecoder
+        >>> import torch
+
+        >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
+        >>> model = XLMProphetNetDecoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone', add_cross_attention=False, return_dict=True)
+        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+    """
+
+    config_class = XLMProphetNetConfig
+
+
+class XLMProphetNetModel(ProphetNetModel):
+    r"""
+    This class overrides :class:`~transformers.ProphetNetModel`. Please check the
+    superclass for the appropriate documentation alongside usage examples.
+
+    Example::
+
+        >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetModel
+
+        >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
+        >>> model = XLMProphetNetModel.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
+
+        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, return_dict=True)
+
+        >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
+            >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
+    """
+
+    config_class = XLMProphetNetConfig
+
+
+class XLMProphetNetForConditionalGeneration(ProphetNetForConditionalGeneration):
+    r"""
+    This class overrides :class:`~transformers.ProphetNetForConditionalGeneration`. Please check the
+    superclass for the appropriate documentation alongside usage examples.
+
+    Example::
+
+        >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetForConditionalGeneration
+
+        >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
+        >>> model =  XLMProphetNetForConditionalGeneration.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
+
+        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, return_dict=True)
+
+        >>> logits_next_token = outputs.logits  # logits to predict next token as usual
+        >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
+    """
+
+    config_class = XLMProphetNetConfig
+
+
+class XLMProphetNetForCausalLM(ProphetNetForCausalLM):
+    r"""
+    This class overrides :class:`~transformers.ProphetNetForCausalLM`. Please check the
+    superclass for the appropriate documentation alongside usage examples.
+
+    Example::
+
+        >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetForCausalLM
+        >>> import torch
+
+        >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
+        >>> model = XLMProphetNetForCausalLM.from_pretrained('patrickvonplaten/xprophetnet-decoder-clm-large-uncased', return_dict=True)
+        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> logits = outputs.logits
+
+        >>> # Model can also be used with EncoderDecoder framework
+        >>> from transformers import BertTokenizer, EncoderDecoderModel
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-uncased-large')
+        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-uncased-large", "patrickvonplaten/xprophetnet-decoder-clm-large-uncased")
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(input_ids=inputs["input_ids"], labels=inputs["input_ids"])
+
+        >>> loss = outputs.loss
+    """
+
+    config_class = XLMProphetNetConfig
diff --git a/src/transformers/tokenization_auto.py b/src/transformers/tokenization_auto.py
index 018deb90b9..24d391b596 100644
--- a/src/transformers/tokenization_auto.py
+++ b/src/transformers/tokenization_auto.py
@@ -43,6 +43,7 @@ from .configuration_auto import (
     MobileBertConfig,
     OpenAIGPTConfig,
     PegasusConfig,
+    ProphetNetConfig,
     RagConfig,
     ReformerConfig,
     RetriBertConfig,
@@ -51,6 +52,7 @@ from .configuration_auto import (
     T5Config,
     TransfoXLConfig,
     XLMConfig,
+    XLMProphetNetConfig,
     XLMRobertaConfig,
     XLNetConfig,
     replace_list_option_in_docstrings,
@@ -77,6 +79,7 @@ from .tokenization_lxmert import LxmertTokenizer
 from .tokenization_mobilebert import MobileBertTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_phobert import PhobertTokenizer
+from .tokenization_prophetnet import ProphetNetTokenizer
 from .tokenization_rag import RagTokenizer
 from .tokenization_retribert import RetriBertTokenizer
 from .tokenization_roberta import RobertaTokenizer
@@ -95,6 +98,7 @@ if is_sentencepiece_available():
     from .tokenization_pegasus import PegasusTokenizer
     from .tokenization_reformer import ReformerTokenizer
     from .tokenization_t5 import T5Tokenizer
+    from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer
     from .tokenization_xlm_roberta import XLMRobertaTokenizer
     from .tokenization_xlnet import XLNetTokenizer
 else:
@@ -199,6 +203,8 @@ TOKENIZER_MAPPING = OrderedDict(
         (BertGenerationConfig, (BertGenerationTokenizer, None)),
         (DebertaConfig, (DebertaTokenizer, None)),
         (RagConfig, (RagTokenizer, None)),
+        (XLMProphetNetConfig, (XLMProphetNetTokenizer, None)),
+        (ProphetNetConfig, (ProphetNetTokenizer, None)),
     ]
 )
 
diff --git a/src/transformers/tokenization_deberta.py b/src/transformers/tokenization_deberta.py
index a3a9edd168..130b7da4ba 100644
--- a/src/transformers/tokenization_deberta.py
+++ b/src/transformers/tokenization_deberta.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """ Tokenization class for model DeBERTa."""
 
-import logging
 import os
 import pathlib
 import random
@@ -28,6 +27,7 @@ import tqdm
 import requests
 
 from .tokenization_utils import PreTrainedTokenizer
+from .utils import logging
 
 
 try:
@@ -36,7 +36,7 @@ except ImportError:
     raise ImportError("Please install regex with: pip install regex")
 
 
-logger = logging.getLogger(__name__)
+logger = logging.get_logger(__name__)
 
 VOCAB_FILES_NAMES = {"vocab_file": "bpe_encoder.bin"}
 
diff --git a/src/transformers/tokenization_fsmt.py b/src/transformers/tokenization_fsmt.py
index 9d94fd2ff0..7efbb7df6a 100644
--- a/src/transformers/tokenization_fsmt.py
+++ b/src/transformers/tokenization_fsmt.py
@@ -16,7 +16,6 @@
 
 
 import json
-import logging
 import os
 import re
 import unicodedata
@@ -27,9 +26,10 @@ import sacremoses as sm
 from .file_utils import add_start_docstrings
 from .tokenization_utils import BatchEncoding, PreTrainedTokenizer
 from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING
+from .utils import logging
 
 
-logger = logging.getLogger(__name__)
+logger = logging.get_logger(__name__)
 
 VOCAB_FILES_NAMES = {
     "src_vocab_file": "vocab-src.json",
diff --git a/src/transformers/tokenization_prophetnet.py b/src/transformers/tokenization_prophetnet.py
new file mode 100644
index 0000000000..ca6288a2ed
--- /dev/null
+++ b/src/transformers/tokenization_prophetnet.py
@@ -0,0 +1,284 @@
+# coding=utf-8
+# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import os
+from typing import List, Optional, Tuple
+
+from .tokenization_bert import BasicTokenizer, WordpieceTokenizer
+from .tokenization_utils import PreTrainedTokenizer
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "prophetnet.tokenizer"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "microsoft/prophetnet-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/prophetnet-large-uncased/prophetnet.tokenizer",
+    }
+}
+
+PRETRAINED_INIT_CONFIGURATION = {
+    "microsoft/prophetnet-large-uncased": {"do_lower_case": True},
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "microsoft/prophetnet-large-uncased": 512,
+}
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        vocab[token] = index
+    return vocab
+
+
+class ProphetNetTokenizer(PreTrainedTokenizer):
+    r"""
+    Construct a ProphetNetTokenizer. Based on WordPiece.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            File containing the vocabulary.
+        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not to lowercase the input when tokenizing.
+        do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not to do basic tokenization before WordPiece.
+        never_split (:obj:`Iterable`, `optional`):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            :obj:`do_basic_tokenize=True`
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        x_sep_token (:obj:`str`, `optional`, defaults to :obj:`"[X_SEP]"`):
+            Special second separator token, which can be generated by :class:`~transformers.ProphetNetForConditionalGeneration`.
+            It is used to separate bullet-point like sentences in summarization, *e.g.*.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this `issue
+            <https://github.com/huggingface/transformers/issues/328>`__).
+        strip_accents: (:obj:`bool`, `optional`):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for :obj:`lowercase` (as in the original BERT).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        do_basic_tokenize=True,
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        x_sep_token="[X_SEP]",
+        pad_token="[PAD]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        **kwargs
+    ):
+        super().__init__(
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            x_sep_token=x_sep_token,
+            **kwargs,
+        )
+        self.unique_no_split_tokens.append(x_sep_token)
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = ProphetNetTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
+            )
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+        self.do_basic_tokenize = do_basic_tokenize
+        if do_basic_tokenize:
+            self.basic_tokenizer = BasicTokenizer(
+                do_lower_case=do_lower_case,
+                never_split=never_split,
+                tokenize_chinese_chars=tokenize_chinese_chars,
+                strip_accents=strip_accents,
+            )
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def get_vocab(self):
+        return dict(self.vocab, **self.added_tokens_encoder)
+
+    def _tokenize(self, text):
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
+
+                # If the token is part of the never_split set
+                if token in self.basic_tokenizer.never_split:
+                    split_tokens.append(token)
+                else:
+                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
+        return split_tokens
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        out_string = " ".join(tokens).replace(" ##", "").strip()
+        return out_string
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` method.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+        if token_ids_1 is None:
+            return ([0] * len(token_ids_0)) + [1]
+        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A ProphetNet sequence pair mask has the following format:
+
+        ::
+
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence    | second sequence |
+
+        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
+        """
+        sep = [self.sep_token_id]
+        if token_ids_1 is None:
+            return len(token_ids_0 + sep) * [0]
+        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        index = 0
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+            )
+        else:
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
+                    )
+                    index = token_index
+                writer.write(token + "\n")
+                index += 1
+        return (vocab_file,)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A BERT sequence has the following format:
+
+        - single sequence: ``[CLS] X [SEP]``
+        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return token_ids_0 + [self.sep_token_id]
+        sep = [self.sep_token_id]
+        return token_ids_0 + sep + token_ids_1 + sep
diff --git a/src/transformers/tokenization_xlm_prophetnet.py b/src/transformers/tokenization_xlm_prophetnet.py
new file mode 100644
index 0000000000..a7e5d2ffcb
--- /dev/null
+++ b/src/transformers/tokenization_xlm_prophetnet.py
@@ -0,0 +1,307 @@
+# coding=utf-8
+# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import os
+from shutil import copyfile
+from typing import List, Optional, Tuple
+
+from .tokenization_utils import PreTrainedTokenizer
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+SPIECE_UNDERLINE = "▁"
+
+VOCAB_FILES_NAMES = {"vocab_file": "prophetnet.tokenizer"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "microsoft/xprophetnet-large-wiki100-cased": "https://cdn.huggingface.co/microsoft/xprophetnet-large-wiki100-cased/prophetnet.tokenizer",
+    }
+}
+
+PRETRAINED_INIT_CONFIGURATION = {
+    "microsoft/xprophetnet-large-wiki100-cased": {"do_lower_case": False},
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "microsoft/xprophetnet-large-wiki100-cased": 512,
+}
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        vocab[token] = index
+    return vocab
+
+
+class XLMProphetNetTokenizer(PreTrainedTokenizer):
+    """
+    Adapted from :class:`~transfomers.RobertaTokenizer` and class:`~transfomers.XLNetTokenizer`. Based on
+    `SentencePiece <https://github.com/google/sentencepiece>`__.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+            Additional special tokens used by the tokenizer.
+
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="[SEP]",
+        eos_token="[SEP]",
+        sep_token="[SEP]",
+        unk_token="[UNK]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        **kwargs
+    ):
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            **kwargs,
+        )
+
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning(
+                "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
+                "pip install sentencepiece"
+            )
+            raise
+
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+
+        # Original fairseq vocab and spm vocab must be "aligned":
+        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
+        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
+        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
+        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
+
+        # put special tokens and [unused] tokens into the vocab
+        self.fairseq_tokens_to_ids = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[UNK]": 3, "[MASK]": 4}
+
+        for i in range(10):
+            tok = "[unused{}]".format(i)
+            self.fairseq_tokens_to_ids[tok] = 5 + i
+
+        # The first "real" token "," has position 15 in the embedding vocab and position 3 in the spm vocab
+        self.fairseq_offset = 12
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+        for k in self.fairseq_tokens_to_ids.keys():
+            self.unique_no_split_tokens.append(k)
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning(
+                "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
+                "pip install sentencepiece"
+            )
+            raise
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(self.vocab_file)
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` method.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+        if token_ids_1 is None:
+            return ([0] * len(token_ids_0)) + [1]
+        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
+        XLMProphetNet does not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of zeros.
+
+        """
+
+        sep = [self.sep_token_id]
+
+        if token_ids_1 is None:
+            return len(token_ids_0 + sep) * [0]
+        return len(token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    @property
+    def vocab_size(self):
+        return len(self.sp_model) + self.fairseq_offset
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text):
+        return self.sp_model.EncodeAsPieces(text)
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        if token in self.fairseq_tokens_to_ids:
+            return self.fairseq_tokens_to_ids[token]
+        spm_id = self.sp_model.PieceToId(token)
+
+        # Need to return unknown token if the SP model returned 0
+        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        if index in self.fairseq_ids_to_tokens:
+            return self.fairseq_ids_to_tokens[index]
+        return self.sp_model.IdToPiece(index - self.fairseq_offset)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+        return out_string
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A XLMProphetNet sequence has the following format:
+
+        - single sequence: ``X [SEP]``
+        - pair of sequences: ``A [SEP] B [SEP]``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return token_ids_0 + [self.sep_token_id]
+        sep = [self.sep_token_id]
+        return token_ids_0 + sep + token_ids_1 + sep
diff --git a/src/transformers/tokenization_xlm_roberta.py b/src/transformers/tokenization_xlm_roberta.py
index 44052e8185..0fa07d289f 100644
--- a/src/transformers/tokenization_xlm_roberta.py
+++ b/src/transformers/tokenization_xlm_roberta.py
@@ -22,12 +22,13 @@ from typing import List, Optional, Tuple
 import sentencepiece as spm
 
 from .tokenization_utils import PreTrainedTokenizer
-from .tokenization_xlnet import SPIECE_UNDERLINE
 from .utils import logging
 
 
 logger = logging.get_logger(__name__)
 
+SPIECE_UNDERLINE = "▁"
+
 VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 0b8e7662f3..3e152be240 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -1305,6 +1305,51 @@ class PegasusForConditionalGeneration:
         requires_pytorch(self)
 
 
+PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class ProphetNetDecoder:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class ProphetNetEncoder:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class ProphetNetForCausalLM:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class ProphetNetForConditionalGeneration:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class ProphetNetModel:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class ProphetNetPreTrainedModel:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
 class RagModel:
     def __init__(self, *args, **kwargs):
         requires_pytorch(self)
@@ -1706,6 +1751,42 @@ class XLMWithLMHeadModel:
         requires_pytorch(self)
 
 
+XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class XLMProphetNetDecoder:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class XLMProphetNetEncoder:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class XLMProphetNetForCausalLM:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class XLMProphetNetForConditionalGeneration:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class XLMProphetNetModel:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
 XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py
index 46fdb3a46d..d0c9b64aac 100644
--- a/src/transformers/utils/dummy_sentencepiece_objects.py
+++ b/src/transformers/utils/dummy_sentencepiece_objects.py
@@ -74,6 +74,15 @@ class T5Tokenizer:
         requires_sentencepiece(self)
 
 
+class XLMProphetNetTokenizer:
+    def __init__(self, *args, **kwargs):
+        requires_sentencepiece(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_sentencepiece(self)
+
+
 class XLMRobertaTokenizer:
     def __init__(self, *args, **kwargs):
         requires_sentencepiece(self)
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 95bf1b3fea..49bf7d5d47 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -112,6 +112,7 @@ class ModelTesterMixin:
             model.eval()
             with torch.no_grad():
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
             out_2 = outputs[0].cpu().numpy()
             out_2[np.isnan(out_2)] = 0
 
@@ -152,6 +153,7 @@ class ModelTesterMixin:
             with torch.no_grad():
                 first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
                 second = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+
             out_1 = first.cpu().numpy()
             out_2 = second.cpu().numpy()
             out_1 = out_1[~np.isnan(out_1)]
@@ -183,10 +185,12 @@ class ModelTesterMixin:
 
     def test_attention_outputs(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.return_dict = True
+
         seq_len = getattr(self.model_tester, "seq_length", None)
         decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
         encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
-        decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
+        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
         encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
         chunk_length = getattr(self.model_tester, "chunk_length", None)
         if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
@@ -211,7 +215,7 @@ class ModelTesterMixin:
             model.eval()
             with torch.no_grad():
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class), return_dict=True)
-            attentions = outputs[-1]
+            attentions = outputs["attentions"] if "attentions" in outputs.keys() else outputs[-1]
             self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
 
             if chunk_length is not None:
@@ -227,8 +231,14 @@ class ModelTesterMixin:
             out_len = len(outputs)
 
             if self.is_encoder_decoder:
-                correct_outlen = 4
-                decoder_attention_idx = 1
+                correct_outlen = (
+                    self.model_tester.base_model_out_len if hasattr(self.model_tester, "base_model_out_len") else 4
+                )
+                decoder_attention_idx = (
+                    self.model_tester.decoder_attention_idx
+                    if hasattr(self.model_tester, "decoder_attention_idx")
+                    else 1
+                )
 
                 # loss is at first position
                 if "labels" in inputs_dict:
@@ -238,6 +248,7 @@ class ModelTesterMixin:
                 if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
                     correct_outlen += 1  # start_logits and end_logits instead of only 1 output
                     decoder_attention_idx += 1
+
                 self.assertEqual(out_len, correct_outlen)
 
                 decoder_attentions = outputs[decoder_attention_idx]
@@ -256,9 +267,16 @@ class ModelTesterMixin:
             model.eval()
             with torch.no_grad():
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
 
-            self_attentions = outputs[-1]
+            if hasattr(self.model_tester, "num_hidden_states_types"):
+                added_hidden_states = self.model_tester.num_hidden_states_types
+            elif self.is_encoder_decoder:
+                added_hidden_states = 2
+            else:
+                added_hidden_states = 1
+            self.assertEqual(out_len + added_hidden_states, len(outputs))
+
+            self_attentions = outputs["attentions"] if "attentions" in outputs else outputs[-1]
             self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
             if chunk_length is not None:
                 self.assertListEqual(
@@ -571,8 +589,8 @@ class ModelTesterMixin:
             model.eval()
 
             with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            hidden_states = outputs[-1]
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class), return_dict=True)
+            hidden_states = outputs["hidden_states"] if "hidden_states" in outputs else outputs[-1]
 
             expected_num_layers = getattr(
                 self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
@@ -822,6 +840,7 @@ class ModelTesterMixin:
             model.eval()
 
             inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
+
             if not self.is_encoder_decoder:
                 input_ids = inputs["input_ids"]
                 del inputs["input_ids"]
@@ -839,7 +858,7 @@ class ModelTesterMixin:
                 inputs["decoder_inputs_embeds"] = wte(decoder_input_ids)
 
             with torch.no_grad():
-                model(**inputs)
+                model(**inputs)[0]
 
     def test_lm_head_model_random_no_beam_search_generate(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/test_modeling_encoder_decoder.py b/tests/test_modeling_encoder_decoder.py
index cdcf510a88..118e40761a 100644
--- a/tests/test_modeling_encoder_decoder.py
+++ b/tests/test_modeling_encoder_decoder.py
@@ -24,6 +24,7 @@ from .test_modeling_bert import BertModelTester
 from .test_modeling_bert_generation import BertGenerationEncoderTester
 from .test_modeling_common import ids_tensor
 from .test_modeling_gpt2 import GPT2ModelTester
+from .test_modeling_prophetnet import ProphetNetStandaloneDecoderModelTester
 from .test_modeling_roberta import RobertaModelTester
 
 
@@ -41,9 +42,11 @@ if is_torch_available():
         EncoderDecoderConfig,
         EncoderDecoderModel,
         GPT2LMHeadModel,
+        ProphetNetForCausalLM,
         RobertaForCausalLM,
         RobertaModel,
     )
+    from transformers.modeling_outputs import BaseModelOutput
 
 
 @require_torch
@@ -82,10 +85,15 @@ class EncoderDecoderMixin:
             decoder_input_ids=decoder_input_ids,
             attention_mask=attention_mask,
             decoder_attention_mask=decoder_attention_mask,
+            return_dict=True,
         )
 
-        self.assertEqual(outputs_encoder_decoder[0].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)))
-        self.assertEqual(outputs_encoder_decoder[1].shape, (input_ids.shape + (config.hidden_size,)))
+        self.assertEqual(
+            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
+        )
+        self.assertEqual(
+            outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
+        )
 
     def check_encoder_decoder_model(
         self,
@@ -109,20 +117,30 @@ class EncoderDecoderMixin:
             decoder_input_ids=decoder_input_ids,
             attention_mask=attention_mask,
             decoder_attention_mask=decoder_attention_mask,
+            return_dict=True,
+        )
+        self.assertEqual(
+            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
+        )
+        self.assertEqual(
+            outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
         )
 
-        self.assertEqual(outputs_encoder_decoder[0].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)))
-        self.assertEqual(outputs_encoder_decoder[1].shape, (input_ids.shape + (config.hidden_size,)))
-        encoder_outputs = (encoder_hidden_states,)
+        encoder_outputs = BaseModelOutput(last_hidden_state=encoder_hidden_states)
         outputs_encoder_decoder = enc_dec_model(
             encoder_outputs=encoder_outputs,
             decoder_input_ids=decoder_input_ids,
             attention_mask=attention_mask,
             decoder_attention_mask=decoder_attention_mask,
+            return_dict=True,
         )
 
-        self.assertEqual(outputs_encoder_decoder[0].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)))
-        self.assertEqual(outputs_encoder_decoder[1].shape, (input_ids.shape + (config.hidden_size,)))
+        self.assertEqual(
+            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
+        )
+        self.assertEqual(
+            outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
+        )
 
     def check_encoder_decoder_model_from_pretrained(
         self,
@@ -145,10 +163,15 @@ class EncoderDecoderMixin:
             decoder_input_ids=decoder_input_ids,
             attention_mask=attention_mask,
             decoder_attention_mask=decoder_attention_mask,
+            return_dict=True,
         )
 
-        self.assertEqual(outputs_encoder_decoder[0].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)))
-        self.assertEqual(outputs_encoder_decoder[1].shape, (input_ids.shape + (config.hidden_size,)))
+        self.assertEqual(
+            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
+        )
+        self.assertEqual(
+            outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
+        )
 
     def check_save_and_load(
         self,
@@ -255,14 +278,19 @@ class EncoderDecoderMixin:
             attention_mask=attention_mask,
             decoder_attention_mask=decoder_attention_mask,
             labels=labels,
+            return_dict=True,
         )
 
-        mlm_loss = outputs_encoder_decoder[0]
+        loss = outputs_encoder_decoder["loss"]
         # check that backprop works
-        mlm_loss.backward()
+        loss.backward()
 
-        self.assertEqual(outputs_encoder_decoder[1].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)))
-        self.assertEqual(outputs_encoder_decoder[2].shape, (input_ids.shape + (config.hidden_size,)))
+        self.assertEqual(
+            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
+        )
+        self.assertEqual(
+            outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
+        )
 
     def check_encoder_decoder_model_generate(self, input_ids, config, decoder_config, **kwargs):
         encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
@@ -425,6 +453,7 @@ class EncoderDecoderMixin:
                 self.assertLessEqual(max_diff, 1e-5)
 
 
+@require_torch
 class BertEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
     def get_pretrained_model(self):
         return EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "bert-base-cased")
@@ -493,6 +522,7 @@ class BertEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
         self.assertEqual(summary, EXPECTED_SUMMARY)
 
 
+@require_torch
 class BertGenerationEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
     def get_pretrained_model(self):
         return EncoderDecoderModel.from_encoder_decoder_pretrained(
@@ -554,6 +584,7 @@ class BertGenerationEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCa
         self.assertEqual(summary, EXPECTED_SUMMARY)
 
 
+@require_torch
 class RoBertaEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
     def get_encoder_decoder_model(self, config, decoder_config):
         encoder_model = RobertaModel(config)
@@ -606,6 +637,7 @@ class RoBertaEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
         return EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base")
 
 
+@require_torch
 class GPT2EncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
     def get_encoder_decoder_model(self, config, decoder_config):
         encoder_model = BertModel(config)
@@ -663,3 +695,59 @@ class GPT2EncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
 
     def test_encoder_decoder_model_shared_weights(self):
         pass
+
+
+@require_torch
+class ProphetNetEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
+    def get_encoder_decoder_model(self, config, decoder_config):
+        encoder_model = BertModel(config)
+        decoder_model = ProphetNetForCausalLM(decoder_config)
+        return encoder_model, decoder_model
+
+    def prepare_config_and_inputs(self):
+        model_tester_encoder = BertModelTester(self, batch_size=13)
+        model_tester_decoder = ProphetNetStandaloneDecoderModelTester(
+            self, batch_size=13, hidden_size=32, max_position_embeddings=512
+        )
+        encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
+        decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = encoder_config_and_inputs
+        (
+            decoder_config,
+            decoder_input_ids,
+            decoder_attention_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            lm_labels,
+        ) = decoder_config_and_inputs
+
+        # make sure that cross attention layers are added
+        decoder_config.add_cross_attention = True
+        #  disable cache for now
+        decoder_config.use_cache = False
+        return {
+            "config": config,
+            "input_ids": input_ids,
+            "attention_mask": input_mask,
+            "decoder_config": decoder_config,
+            "decoder_input_ids": decoder_input_ids,
+            "decoder_attention_mask": decoder_attention_mask,
+            "encoder_hidden_states": encoder_hidden_states,
+            "labels": lm_labels,
+        }
+
+    def get_pretrained_model(self):
+        return EncoderDecoderModel.from_encoder_decoder_pretrained(
+            "bert-large-uncased", "patrickvonplaten/prophetnet-decoder-clm-large-uncased"
+        )
+
+    def test_encoder_decoder_model_shared_weights(self):
+        pass
diff --git a/tests/test_modeling_prophetnet.py b/tests/test_modeling_prophetnet.py
new file mode 100644
index 0000000000..bac4776271
--- /dev/null
+++ b/tests/test_modeling_prophetnet.py
@@ -0,0 +1,1075 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import copy
+import tempfile
+import unittest
+
+from transformers import is_torch_available
+from transformers.testing_utils import require_torch, slow, torch_device
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        ProphetNetConfig,
+        ProphetNetDecoder,
+        ProphetNetEncoder,
+        ProphetNetForCausalLM,
+        ProphetNetForConditionalGeneration,
+        ProphetNetModel,
+        ProphetNetTokenizer,
+    )
+
+
+class ProphetNetModelTester:
+    def __init__(
+        self,
+        parent,
+        vocab_size=99,
+        batch_size=13,
+        hidden_size=16,
+        encoder_seq_length=7,
+        decoder_seq_length=9,
+        # For common tests
+        is_training=True,
+        use_attention_mask=True,
+        use_labels=True,
+        decoder_start_token_id=0,
+        encoder_ffn_dim=32,
+        num_encoder_layers=4,
+        num_encoder_attention_heads=4,
+        decoder_ffn_dim=32,
+        num_decoder_layers=4,
+        num_decoder_attention_heads=4,
+        max_position_embeddings=30,
+        is_encoder_decoder=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        ngram=2,
+        num_buckets=32,
+        relative_max_distance=128,
+        disable_ngram_loss=False,
+        scope=None,
+    ):
+
+        self.parent = parent
+        self.batch_size = batch_size
+        self.encoder_seq_length = encoder_seq_length
+        self.decoder_seq_length = decoder_seq_length
+        # For common tests
+        self.seq_length = self.decoder_seq_length
+        self.is_training = is_training
+        self.use_attention_mask = use_attention_mask
+        self.use_labels = use_labels
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_decoder_layers
+        self.num_encoder_layers = num_encoder_layers
+        self.num_decoder_layers = num_decoder_layers
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.num_attention_heads = num_decoder_attention_heads
+        self.num_encoder_attention_heads = num_encoder_attention_heads
+        self.num_decoder_attention_heads = num_decoder_attention_heads
+        self.eos_token_id = eos_token_id
+        self.bos_token_id = bos_token_id
+        self.pad_token_id = pad_token_id
+        self.decoder_start_token_id = decoder_start_token_id
+        self.ngram = ngram
+        self.num_buckets = num_buckets
+        self.relative_max_distance = relative_max_distance
+        self.disable_ngram_loss = disable_ngram_loss
+        self.max_position_embeddings = max_position_embeddings
+        self.is_encoder_decoder = is_encoder_decoder
+
+        self.scope = None
+        self.decoder_key_length = decoder_seq_length
+        self.base_model_out_len = 7
+        self.num_hidden_states_types = 3  # encoder, decoder_main, decoder_ngram
+        self.decoder_attention_idx = 2
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
+        decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
+
+        attention_mask = None
+        decoder_attention_mask = None
+        if self.use_attention_mask:
+            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
+            decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
+
+        lm_labels = None
+        if self.use_labels:
+            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
+
+        config = ProphetNetConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_encoder_layers=self.num_encoder_layers,
+            num_decoder_layers=self.num_decoder_layers,
+            decoder_ffn_dim=self.decoder_ffn_dim,
+            encoder_ffn_dim=self.encoder_ffn_dim,
+            num_encoder_attention_heads=self.num_encoder_attention_heads,
+            num_decoder_attention_heads=self.num_decoder_attention_heads,
+            eos_token_id=self.eos_token_id,
+            bos_token_id=self.bos_token_id,
+            pad_token_id=self.pad_token_id,
+            decoder_start_token_id=self.decoder_start_token_id,
+            ngram=self.ngram,
+            num_buckets=self.num_buckets,
+            relative_max_distance=self.relative_max_distance,
+            disable_ngram_loss=self.disable_ngram_loss,
+            max_position_embeddings=self.max_position_embeddings,
+            is_encoder_decoder=self.is_encoder_decoder,
+            return_dict=True,
+        )
+
+        return (
+            config,
+            input_ids,
+            decoder_input_ids,
+            attention_mask,
+            decoder_attention_mask,
+            lm_labels,
+        )
+
+    def prepare_config_and_inputs_for_decoder(self):
+        (
+            config,
+            input_ids,
+            decoder_input_ids,
+            attention_mask,
+            decoder_attention_mask,
+            lm_labels,
+        ) = self.prepare_config_and_inputs()
+
+        encoder_hidden_states = floats_tensor([self.batch_size, self.encoder_seq_length, self.hidden_size])
+        encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
+
+        return (
+            config,
+            decoder_input_ids,
+            decoder_attention_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            lm_labels,
+        )
+
+    def check_prepare_lm_labels_via_shift_left(
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
+    ):
+        model = ProphetNetModel(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        # make sure that lm_labels are correctly padded from the right
+        lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id)
+
+        # add casaul pad token mask
+        triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not()
+        lm_labels.masked_fill_(triangular_mask, self.pad_token_id)
+        decoder_input_ids = model._shift_right(lm_labels)
+
+        for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)):
+            # first item
+            self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id)
+            if i < decoder_input_ids_slice.shape[-1]:
+                if i < decoder_input_ids.shape[-1] - 1:
+                    # items before diagonal
+                    self.parent.assertListEqual(
+                        decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist()
+                    )
+                # pad items after diagonal
+                if i < decoder_input_ids.shape[-1] - 2:
+                    self.parent.assertListEqual(
+                        decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist()
+                    )
+            else:
+                # all items after square
+                self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist())
+
+    def create_and_check_model(
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
+    ):
+        model = ProphetNetModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(
+            input_ids=input_ids,
+            decoder_input_ids=decoder_input_ids,
+            attention_mask=attention_mask,
+            decoder_attention_mask=decoder_attention_mask,
+        )
+        result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+        decoder_output = result.last_hidden_state
+        decoder_past = result.past_key_values
+        encoder_output = result.encoder_last_hidden_state
+
+        self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.encoder_seq_length, self.hidden_size))
+        self.parent.assertEqual(decoder_output.size(), (self.batch_size, self.decoder_seq_length, self.hidden_size))
+        # There should be `num_layers` key value embeddings stored in decoder_past
+        self.parent.assertEqual(len(decoder_past), config.num_decoder_layers)
+        # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple
+        self.parent.assertEqual(len(decoder_past[0]), 2)  # cross-attention + uni-directional self-attention
+
+    def create_and_check_with_lm_head(
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
+    ):
+        model = ProphetNetForConditionalGeneration(config=config).to(torch_device).eval()
+        outputs = model(
+            input_ids=input_ids,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            labels=lm_labels,
+        )
+        self.parent.assertEqual(len(outputs), 5)
+        self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size))
+        self.parent.assertEqual(outputs["loss"].size(), ())
+
+    def create_and_check_causal_lm_decoder(
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
+    ):
+        model = ProphetNetForCausalLM(config=config).to(torch_device).eval()
+        outputs = model(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            labels=lm_labels,
+        )
+        self.parent.assertEqual(len(outputs), 4)
+        self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size))
+        self.parent.assertEqual(outputs["loss"].size(), ())
+
+    def create_and_check_generate_with_past_key_value_states(
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
+    ):
+        model = ProphetNetForConditionalGeneration(config=config).to(torch_device).eval()
+        torch.manual_seed(0)
+        output_without_past_cache = model.generate(
+            input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False
+        )
+        torch.manual_seed(0)
+        output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True)
+        self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))
+
+    def create_and_check_model_fp16_forward(
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
+    ):
+        model = ProphetNetModel(config=config).to(torch_device).half().eval()
+        output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"]
+        self.parent.assertFalse(torch.isnan(output).any().item())
+
+    def create_and_check_encoder_decoder_shared_weights(
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        attention_mask,
+        decoder_attention_mask,
+        lm_labels,
+    ):
+        for model_class in [ProphetNetModel, ProphetNetForConditionalGeneration]:
+            torch.manual_seed(0)
+            model = model_class(config=config).to(torch_device).eval()
+            # load state dict copies weights but does not tie them
+
+            if model_class == ProphetNetForConditionalGeneration:
+                model.prophetnet.encoder.load_state_dict(model.prophetnet.decoder.state_dict(), strict=False)
+            else:
+                model.encoder.load_state_dict(model.decoder.state_dict(), strict=False)
+
+            torch.manual_seed(0)
+            tied_config = copy.deepcopy(config)
+            tied_config.tie_encoder_decoder = True
+            tied_model = model_class(config=tied_config).to(torch_device).eval()
+
+            model_result = model(
+                input_ids=input_ids,
+                decoder_input_ids=decoder_input_ids,
+                attention_mask=attention_mask,
+                decoder_attention_mask=decoder_attention_mask,
+                return_dict=True,
+            )
+
+            tied_model_result = tied_model(
+                input_ids=input_ids,
+                decoder_input_ids=decoder_input_ids,
+                attention_mask=attention_mask,
+                decoder_attention_mask=decoder_attention_mask,
+                return_dict=True,
+            )
+
+            # check that models has less parameters
+            self.parent.assertLess(
+                sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters())
+            )
+            random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item()
+
+            # check that outputs are equal
+            self.parent.assertTrue(
+                torch.allclose(
+                    model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4
+                )
+            )
+
+            # check that outputs after saving and loading are equal
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                tied_model.save_pretrained(tmpdirname)
+                tied_model = model_class.from_pretrained(tmpdirname)
+                tied_model.to(torch_device)
+                tied_model.eval()
+
+                # check that models has less parameters
+                self.parent.assertLess(
+                    sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters())
+                )
+                random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item()
+
+                tied_model_result = tied_model(
+                    input_ids=input_ids,
+                    decoder_input_ids=decoder_input_ids,
+                    attention_mask=attention_mask,
+                    decoder_attention_mask=decoder_attention_mask,
+                )
+
+                # check that outputs are equal
+                self.parent.assertTrue(
+                    torch.allclose(
+                        model_result[0][0, :, random_slice_idx],
+                        tied_model_result[0][0, :, random_slice_idx],
+                        atol=1e-4,
+                    )
+                )
+
+    def check_fast_integration(
+        self,
+        config,
+        *args,
+    ):
+        input_ids = torch.tensor([[7, 4, 78, 0, 24, 52, 43]], device=torch_device, dtype=torch.long)
+        decoder_input_ids = torch.tensor([[12, 62, 25, 11, 47, 15, 14]], device=torch_device, dtype=torch.long)
+        attention_mask = torch.tensor([[1, 1, 1, 0, 1, 0, 0]], device=torch_device, dtype=torch.long)
+        decoder_attention_mask = torch.tensor([[1, 1, 1, 0, 0, 1, 0]], device=torch_device, dtype=torch.long)
+        lm_labels = torch.tensor([[62, 25, 11, 47, 15, 14, 24]], device=torch_device, dtype=torch.long)
+        torch.manual_seed(0)
+        config.ngram = 4
+        model = ProphetNetForConditionalGeneration(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            result = model(
+                input_ids=input_ids,
+                decoder_input_ids=decoder_input_ids,
+                attention_mask=attention_mask,
+                decoder_attention_mask=decoder_attention_mask,
+                labels=lm_labels,
+                return_dict=True,
+            )
+        self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(128.2925, device=torch_device), atol=1e-3))
+
+        expected_logit_slice = torch.tensor(
+            [-0.1565, 0.0418, 0.1207, 0.0030, 0.0665, 0.0467, 0.0412], device=torch_device
+        )
+        self.parent.assertTrue(torch.allclose(result.logits[0, :, 1], expected_logit_slice, atol=1e-3))
+
+    def check_model_with_attn_mask(self, config, input_ids, decoder_input_ids, *args):
+        model = ProphetNetModel(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        outputs_no_mask = model(
+            input_ids=input_ids[:, :5], decoder_input_ids=decoder_input_ids[:, :5], return_dict=True
+        )
+        attention_mask = torch.ones_like(input_ids)
+        decoder_attention_mask = torch.ones_like(decoder_input_ids)
+
+        attention_mask[:, 5:] = 0
+
+        outputs_with_mask = model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            return_dict=True,
+        )
+
+        # check encoder
+        self.parent.assertTrue(
+            torch.allclose(
+                outputs_no_mask.encoder_last_hidden_state[0, :, 0],
+                outputs_with_mask.encoder_last_hidden_state[0, :5, 0],
+                atol=1e-3,
+            )
+        )
+
+        # check decoder
+        # main stream
+        self.parent.assertTrue(
+            torch.allclose(
+                outputs_no_mask.last_hidden_state[0, :, 0], outputs_with_mask.last_hidden_state[0, :5, 0], atol=1e-3
+            )
+        )
+        # predict stream
+        self.parent.assertTrue(
+            torch.allclose(
+                outputs_no_mask.last_hidden_state_ngram[0, :5, 0],
+                outputs_with_mask.last_hidden_state_ngram[0, :5, 0],
+                atol=1e-3,
+            )
+        )
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            decoder_input_ids,
+            attention_mask,
+            decoder_attention_mask,
+            lm_labels,
+        ) = config_and_inputs
+
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "decoder_input_ids": decoder_input_ids,
+            "decoder_attention_mask": decoder_attention_mask,
+            "use_cache": False,
+        }
+        return config, inputs_dict
+
+
+class ProphetNetStandaloneDecoderModelTester:
+    def __init__(
+        self,
+        parent,
+        vocab_size=99,
+        batch_size=13,
+        hidden_size=16,
+        encoder_seq_length=7,
+        decoder_seq_length=7,
+        # For common tests
+        is_training=True,
+        is_decoder=True,
+        use_attention_mask=True,
+        add_cross_attention=False,
+        use_cache=False,
+        use_labels=True,
+        decoder_start_token_id=0,
+        encoder_ffn_dim=32,
+        num_encoder_layers=4,
+        num_encoder_attention_heads=4,
+        decoder_ffn_dim=32,
+        num_decoder_layers=4,
+        num_decoder_attention_heads=4,
+        max_position_embeddings=30,
+        is_encoder_decoder=False,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        ngram=2,
+        return_dict=True,
+        num_buckets=32,
+        relative_max_distance=128,
+        disable_ngram_loss=False,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.encoder_seq_length = encoder_seq_length
+        self.decoder_seq_length = decoder_seq_length
+        # For common tests
+        self.seq_length = self.decoder_seq_length
+        self.is_training = is_training
+        self.use_attention_mask = use_attention_mask
+        self.use_labels = use_labels
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_decoder_layers
+        self.num_encoder_layers = num_encoder_layers
+        self.num_decoder_layers = num_decoder_layers
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.num_attention_heads = num_decoder_attention_heads
+        self.num_encoder_attention_heads = num_encoder_attention_heads
+        self.num_decoder_attention_heads = num_decoder_attention_heads
+        self.eos_token_id = eos_token_id
+        self.bos_token_id = bos_token_id
+        self.pad_token_id = pad_token_id
+        self.decoder_start_token_id = decoder_start_token_id
+        self.ngram = ngram
+        self.num_buckets = num_buckets
+        self.relative_max_distance = relative_max_distance
+        self.use_cache = use_cache
+        self.disable_ngram_loss = disable_ngram_loss
+        self.max_position_embeddings = max_position_embeddings
+        self.add_cross_attention = add_cross_attention
+        self.is_encoder_decoder = is_encoder_decoder
+        self.return_dict = return_dict
+
+        self.scope = None
+        self.decoder_key_length = decoder_seq_length
+        self.base_model_out_len = 2
+        self.num_hidden_states_types = 2  # decoder_main, decoder_ngram
+        self.decoder_attention_idx = 1
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
+
+        attention_mask = None
+        if self.use_attention_mask:
+            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
+
+        lm_labels = None
+        if self.use_labels:
+            lm_labels = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
+
+        config = ProphetNetConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_encoder_layers=self.num_encoder_layers,
+            num_decoder_layers=self.num_decoder_layers,
+            decoder_ffn_dim=self.decoder_ffn_dim,
+            encoder_ffn_dim=self.encoder_ffn_dim,
+            num_encoder_attention_heads=self.num_encoder_attention_heads,
+            num_decoder_attention_heads=self.num_decoder_attention_heads,
+            eos_token_id=self.eos_token_id,
+            bos_token_id=self.bos_token_id,
+            use_cache=self.use_cache,
+            pad_token_id=self.pad_token_id,
+            decoder_start_token_id=self.decoder_start_token_id,
+            ngram=self.ngram,
+            num_buckets=self.num_buckets,
+            relative_max_distance=self.relative_max_distance,
+            disable_ngram_loss=self.disable_ngram_loss,
+            max_position_embeddings=self.max_position_embeddings,
+            add_cross_attention=self.add_cross_attention,
+            is_encoder_decoder=self.is_encoder_decoder,
+            return_dict=self.return_dict,
+        )
+
+        return (
+            config,
+            input_ids,
+            attention_mask,
+            lm_labels,
+        )
+
+    def prepare_config_and_inputs_for_decoder(self):
+        (
+            config,
+            input_ids,
+            attention_mask,
+            lm_labels,
+        ) = self.prepare_config_and_inputs()
+
+        encoder_hidden_states = floats_tensor([self.batch_size, self.encoder_seq_length, self.hidden_size])
+        encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
+
+        return (
+            config,
+            input_ids,
+            attention_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            lm_labels,
+        )
+
+    def create_and_check_decoder_model_past(
+        self,
+        config,
+        input_ids,
+        attention_mask,
+        lm_labels,
+    ):
+        config.use_cache = True
+        model = ProphetNetDecoder(config=config).to(torch_device).eval()
+        # first forward pass
+        outputs = model(input_ids, use_cache=True)
+        outputs_use_cache_conf = model(input_ids)
+        outputs_no_past = model(input_ids, use_cache=False)
+
+        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
+        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
+
+        past_key_values = outputs["past_key_values"]
+
+        # create hypothetical next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+        # append to next input_ids and
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+
+        output_from_no_past = model(next_input_ids)["last_hidden_state"]
+        output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+        # test that outputs are equal for slice
+        assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)
+
+    def create_and_check_decoder_model_attention_mask_past(
+        self,
+        config,
+        input_ids,
+        attention_mask,
+        lm_labels,
+    ):
+        model = ProphetNetDecoder(config=config).to(torch_device).eval()
+
+        # create attention mask
+        attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
+
+        half_seq_length = input_ids.shape[-1] // 2
+        attn_mask[:, half_seq_length:] = 0
+
+        # first forward pass
+        past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"]
+
+        # create hypothetical next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+        # change a random masked slice from input_ids
+        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
+        random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
+        input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
+
+        # append to next input_ids and attn_mask
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        attn_mask = torch.cat(
+            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
+            dim=1,
+        )
+
+        # get two different outputs
+        output_from_no_past = model(next_input_ids)["last_hidden_state"]
+        output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+        # test that outputs are equal for slice
+        assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            attention_mask,
+            lm_labels,
+        ) = config_and_inputs
+
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+        }
+        return config, inputs_dict
+
+
+class ProphetNetStandaloneEncoderModelTester:
+    def __init__(
+        self,
+        parent,
+        vocab_size=99,
+        batch_size=13,
+        hidden_size=16,
+        encoder_seq_length=7,
+        decoder_seq_length=7,
+        # For common tests
+        is_training=True,
+        is_decoder=False,
+        use_attention_mask=True,
+        add_cross_attention=False,
+        use_cache=False,
+        use_labels=True,
+        decoder_start_token_id=0,
+        encoder_ffn_dim=32,
+        num_encoder_layers=4,
+        num_encoder_attention_heads=4,
+        decoder_ffn_dim=32,
+        num_decoder_layers=4,
+        num_decoder_attention_heads=4,
+        max_position_embeddings=30,
+        is_encoder_decoder=False,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        return_dict=True,
+        num_buckets=32,
+        relative_max_distance=128,
+        disable_ngram_loss=False,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.encoder_seq_length = encoder_seq_length
+        self.decoder_seq_length = decoder_seq_length
+        # For common tests
+        self.seq_length = self.decoder_seq_length
+        self.is_training = is_training
+        self.use_attention_mask = use_attention_mask
+        self.use_labels = use_labels
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_decoder_layers
+        self.num_encoder_layers = num_encoder_layers
+        self.num_decoder_layers = num_decoder_layers
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.num_attention_heads = num_decoder_attention_heads
+        self.num_encoder_attention_heads = num_encoder_attention_heads
+        self.num_decoder_attention_heads = num_decoder_attention_heads
+        self.eos_token_id = eos_token_id
+        self.bos_token_id = bos_token_id
+        self.pad_token_id = pad_token_id
+        self.decoder_start_token_id = decoder_start_token_id
+        self.num_buckets = num_buckets
+        self.relative_max_distance = relative_max_distance
+        self.use_cache = use_cache
+        self.disable_ngram_loss = disable_ngram_loss
+        self.max_position_embeddings = max_position_embeddings
+        self.add_cross_attention = add_cross_attention
+        self.is_encoder_decoder = is_encoder_decoder
+        self.return_dict = return_dict
+
+        self.scope = None
+        self.decoder_key_length = decoder_seq_length
+        self.base_model_out_len = 1
+        self.num_hidden_states_types = 1
+        self.decoder_attention_idx = 1
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
+
+        attention_mask = None
+        if self.use_attention_mask:
+            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
+
+        config = ProphetNetConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_encoder_layers=self.num_encoder_layers,
+            num_decoder_layers=self.num_decoder_layers,
+            decoder_ffn_dim=self.decoder_ffn_dim,
+            encoder_ffn_dim=self.encoder_ffn_dim,
+            num_encoder_attention_heads=self.num_encoder_attention_heads,
+            num_decoder_attention_heads=self.num_decoder_attention_heads,
+            eos_token_id=self.eos_token_id,
+            bos_token_id=self.bos_token_id,
+            use_cache=self.use_cache,
+            pad_token_id=self.pad_token_id,
+            decoder_start_token_id=self.decoder_start_token_id,
+            num_buckets=self.num_buckets,
+            relative_max_distance=self.relative_max_distance,
+            disable_ngram_loss=self.disable_ngram_loss,
+            max_position_embeddings=self.max_position_embeddings,
+            add_cross_attention=self.add_cross_attention,
+            is_encoder_decoder=self.is_encoder_decoder,
+            return_dict=self.return_dict,
+        )
+
+        return (
+            config,
+            input_ids,
+            attention_mask,
+        )
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            attention_mask,
+        ) = config_and_inputs
+
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class ProphetNetModelTest(ModelTesterMixin, unittest.TestCase):
+    all_model_classes = (ProphetNetModel, ProphetNetForConditionalGeneration) if is_torch_available() else ()
+    all_generative_model_classes = (ProphetNetForConditionalGeneration,) if is_torch_available() else ()
+    test_pruning = False
+    test_torchscript = False
+    test_resize_embeddings = False
+    test_headmasking = False
+    is_encoder_decoder = True
+
+    def setUp(self):
+        self.model_tester = ProphetNetModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=ProphetNetConfig)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_lm_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_with_lm_head(*config_and_inputs)
+
+    def test_only_decoder_causal_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_causal_lm_decoder(*config_and_inputs)
+
+    def test_fast_integration(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.check_fast_integration(*config_and_inputs)
+
+    def test_shared_weights(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs)
+
+    def test_shift_labels_via_shift_left(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs)
+
+    def test_decoder_model_generate(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_generate_with_past_key_value_states(*config_and_inputs)
+
+    def test_attn_mask_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.check_model_with_attn_mask(*config_and_inputs)
+
+    @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
+    def test_fp16_forward(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
+
+
+@require_torch
+class ProphetNetStandaloneDecoderModelTest(ModelTesterMixin, unittest.TestCase):
+    all_model_classes = (ProphetNetDecoder, ProphetNetForCausalLM) if is_torch_available() else ()
+    all_generative_model_classes = (ProphetNetForCausalLM,) if is_torch_available() else ()
+    test_pruning = False
+    test_torchscript = False
+    test_resize_embeddings = False
+    test_headmasking = False
+    is_encoder_decoder = False
+
+    def setUp(self):
+        self.model_tester = ProphetNetStandaloneDecoderModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=ProphetNetConfig)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_decoder_model_past(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_decoder_model_past(*config_and_inputs)
+
+    def test_decoder_model_attn_mask_past(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs)
+
+
+@require_torch
+class ProphetNetStandaloneEncoderModelTest(ModelTesterMixin, unittest.TestCase):
+    all_model_classes = (ProphetNetEncoder,) if is_torch_available() else ()
+    test_pruning = False
+    test_torchscript = False
+    test_resize_embeddings = False
+    test_headmasking = False
+    is_encoder_decoder = False
+
+    def setUp(self):
+        self.model_tester = ProphetNetStandaloneEncoderModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=ProphetNetConfig)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+
+class ProphetNetModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_pretrained_checkpoint_hidden_states(self):
+        model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")
+        model.to(torch_device)
+
+        # encoder-decoder outputs
+        encoder_ids = torch.tensor(
+            [
+                [
+                    2871,
+                    102,
+                    2048,
+                    3176,
+                    2780,
+                    1997,
+                    2871,
+                    26727,
+                    2169,
+                    2097,
+                    12673,
+                    1996,
+                    8457,
+                    2006,
+                    2049,
+                    8240,
+                    2859,
+                    2799,
+                    1012,
+                    2023,
+                    6512,
+                    2038,
+                    2174,
+                    13977,
+                    2195,
+                    25962,
+                    1012,
+                    102,
+                ]
+            ]
+        ).to(torch_device)
+
+        decoder_prev_ids = torch.tensor([[102, 2129, 2116, 2372, 2024, 2006, 2169, 1997, 2122, 2048, 2780, 1029]]).to(
+            torch_device
+        )
+        output = model(
+            input_ids=encoder_ids,
+            attention_mask=None,
+            encoder_outputs=None,
+            decoder_input_ids=decoder_prev_ids,
+            return_dict=True,
+        )
+        output_predited_logits = output[0]
+        expected_shape = torch.Size((1, 12, 30522))
+        self.assertEqual(output_predited_logits.shape, expected_shape)
+        expected_slice = torch.tensor(
+            [[[-7.6213, -7.9008, -7.9979], [-7.6834, -7.8467, -8.2187], [-7.5326, -7.4762, -8.1914]]]
+        ).to(torch_device)
+        #        self.assertTrue(torch.allclose(output_predited_logits[:, :3, :3], expected_slice, atol=1e-4))
+        assert torch.allclose(output_predited_logits[:, :3, :3], expected_slice, atol=1e-4)
+
+        # encoder outputs
+        encoder_outputs = model.prophetnet.encoder(encoder_ids)[0]
+        expected_encoder_outputs_slice = torch.tensor(
+            [[[-0.2526, -0.1951, -0.2185], [-0.8923, 0.2992, -0.4623], [-0.4585, 0.0165, -0.6652]]]
+        ).to(torch_device)
+        expected_shape_encoder = torch.Size((1, 28, 1024))
+        self.assertEqual(encoder_outputs.shape, expected_shape_encoder)
+        #        self.assertTrue(torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4))
+        assert torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4)
+
+        # decoder outputs
+        decoder_outputs = model.prophetnet.decoder(
+            decoder_prev_ids, encoder_hidden_states=encoder_outputs, return_dict=True
+        )
+        predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 12, -1)
+        predicting_streams_logits = model.lm_head(predicting_streams)
+        next_first_stream_logits = predicting_streams_logits[:, 0]
+        #        self.assertTrue(torch.allclose(next_first_stream_logits[:, :3, :3], expected_slice, atol=1e-4))
+        assert torch.allclose(next_first_stream_logits[:, :3, :3], expected_slice, atol=1e-4)
+
+    @slow
+    def test_cnndm_inference(self):
+        model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")
+        model.config.max_length = 512
+        model.to(torch_device)
+
+        tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")
+
+        ARTICLE_TO_SUMMARIZE = "USTC was founded in Beijing by the Chinese Academy of Sciences (CAS) in September 1958. The Director of CAS, Mr. Guo Moruo was appointed the first president of USTC. USTC's founding mission was to develop a high-level science and technology workforce, as deemed critical for development of China's economy, defense, and science and technology education. The establishment was hailed as \"A Major Event in the History of Chinese Education and Science.\" CAS has supported USTC by combining most of its institutes with the departments of the university. USTC is listed in the top 16 national key universities, becoming the youngest national key university.".lower()
+        input_ids = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=511, return_tensors="pt").input_ids
+
+        input_ids = input_ids.to(torch_device)
+
+        summary_ids = model.generate(
+            input_ids, num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
+        )
+        EXPECTED_SUMMARIZE_512 = "us ##tc was founded by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc is listed in the top 16 national key universities ."
+        generated_titles = [
+            " ".join(tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True)) for g in summary_ids
+        ]
+        self.assertListEqual(
+            [EXPECTED_SUMMARIZE_512],
+            generated_titles,
+        )
+        input_ids = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=99, return_tensors="pt").input_ids
+        input_ids = input_ids.to(torch_device)
+        # actually 98 tokens are used. max_length=100 contains bos and eos.
+        summary_ids = model.generate(
+            input_ids, num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
+        )
+        EXPECTED_SUMMARIZE_100 = (
+            r"us ##tc was founded in beijing by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc "
+            "'"
+            ' s founding mission was to develop a high - level science and technology workforce . [X_SEP] establishment hailed as " a major event in the history of chinese education and science "'
+        )
+        generated_titles = [
+            " ".join(tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True)) for g in summary_ids
+        ]
+        self.assertListEqual(
+            [EXPECTED_SUMMARIZE_100],
+            generated_titles,
+        )
diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py
index 8411032c38..5887ec6802 100644
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -43,7 +43,6 @@ class T5ModelTester:
         encoder_seq_length=7,
         decoder_seq_length=9,
         # For common tests
-        seq_length=7,
         is_training=True,
         use_attention_mask=True,
         use_labels=True,
diff --git a/tests/test_modeling_xlm_prophetnet.py b/tests/test_modeling_xlm_prophetnet.py
new file mode 100644
index 0000000000..4f2c2558a9
--- /dev/null
+++ b/tests/test_modeling_xlm_prophetnet.py
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers import is_torch_available
+from transformers.testing_utils import slow, torch_device
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import XLMProphetNetForConditionalGeneration, XLMProphetNetTokenizer
+
+
+class XLMProphetNetModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_pretrained_checkpoint_hidden_states(self):
+        model = XLMProphetNetForConditionalGeneration.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+        model.to(torch_device)
+
+        # encoder-decoder outputs
+        encoder_ids = torch.tensor([[17, 96208, 103471, 2]]).to(torch_device)
+        decoder_prev_ids = torch.tensor(
+            [[2, 250, 9953, 34, 69489, 1620, 32, 118424, 624, 210, 105, 2913, 1032, 351]]
+        ).to(torch_device)
+        output = model(
+            input_ids=encoder_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids
+        )
+        output_predited_logis = output[0]
+        expected_shape = torch.Size((1, 14, 250012))
+        self.assertEqual(output_predited_logis.shape, expected_shape)
+        expected_slice = torch.tensor(
+            [[[-6.6042, -8.3838, 12.4717], [-6.4426, -8.1994, 12.4542], [-6.0851, -7.8209, 12.9493]]]
+        ).to(torch_device)
+        self.assertTrue(torch.allclose(output_predited_logis[:, :3, :3], expected_slice, atol=1e-4))
+
+        # encoder outputs
+        encoder_outputs = model.prophetnet.encoder(encoder_ids)[0]
+        expected_encoder_outputs_slice = torch.tensor(
+            [[[-1.4260, -0.7628, 0.8453], [-1.4719, -0.1391, 0.7807], [-1.7678, 0.0114, 0.4646]]]
+        ).to(torch_device)
+        expected_shape_encoder = torch.Size((1, 4, 1024))
+        self.assertEqual(encoder_outputs.shape, expected_shape_encoder)
+        self.assertTrue(torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4))
+
+        # decoder outputs
+        decoder_outputs = model.prophetnet.decoder(
+            decoder_prev_ids,
+            encoder_hidden_states=encoder_outputs,
+        )
+        predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 14, -1)
+        predicting_streams_logits = model.lm_head(predicting_streams)
+        next_first_stream_logits = predicting_streams_logits[:, 0]
+        self.assertTrue(torch.allclose(next_first_stream_logits[:, :3, :3], expected_slice, atol=1e-4))
+
+    @slow
+    def test_ntg_hidden_states(self):
+        model = XLMProphetNetForConditionalGeneration.from_pretrained(
+            "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"
+        )
+        model.to(torch_device)
+
+        encoder_ids = torch.tensor([[17, 96208, 103471, 2]]).to(torch_device)
+        decoder_prev_ids = torch.tensor(
+            [[2, 250, 9953, 34, 69489, 1620, 32, 118424, 624, 210, 105, 2913, 1032, 351]]
+        ).to(torch_device)
+        output = model(
+            input_ids=encoder_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids
+        )
+        output_predited_logis = output[0]
+        expected_shape = torch.Size((1, 14, 250012))
+        self.assertEqual(output_predited_logis.shape, expected_shape)
+        # compare the actual values for a slice.
+        expected_slice = torch.tensor(
+            [[[-8.8815, -9.2996, -4.4506], [-6.7202, -7.8944, -0.9402], [-8.6890, -7.4528, -1.9437]]]
+        ).to(torch_device)
+
+        self.assertTrue(torch.allclose(output_predited_logis[:, :3, :3], expected_slice, atol=1e-4))
+
+    @slow
+    def test_xprophetnet_ntg_inference(self):
+        model = XLMProphetNetForConditionalGeneration.from_pretrained(
+            "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"
+        )
+        model.to(torch_device)
+        model.config.max_length = 512
+
+        tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased-xglue-ntg")
+
+        EN_SENTENCE = "Microsoft Corporation intends to officially end free support for the Windows 7 operating system after January 14, 2020, according to the official portal of the organization. From that day, users of this system will not be able to receive security updates, which could make their computers vulnerable to cyber attacks."
+        RU_SENTENCE = "орпорация Microsoft намерена официально прекратить бесплатную поддержку операционной системы Windows 7 после 14 января 2020 года, сообщается на официальном портале организации . С указанного дня пользователи этой системы не смогут получать обновления безопасности, из-за чего их компьютеры могут стать уязвимыми к кибератакам."
+        ZH_SENTENCE = (
+            "根据该组织的官方门户网站，微软公司打算在2020年1月14日之后正式终止对Windows 7操作系统的免费支持。从那时起，该系统的用户将无法接收安全更新，这可能会使他们的计算机容易受到网络攻击。"
+        )
+
+        input_ids = tokenizer(
+            [EN_SENTENCE, RU_SENTENCE, ZH_SENTENCE], padding=True, max_length=255, return_tensors="pt"
+        ).input_ids
+        input_ids = input_ids.to(torch_device)
+
+        summary_ids = model.generate(
+            input_ids, num_beams=10, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
+        )
+        generated_titles = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
+        EXPECTED_TITLE_EN = "Microsoft to end Windows 7 free support after January 14, 2020"
+        EXPECTED_TITLE_RU = "Microsoft намерена прекратить бесплатную поддержку Windows 7 после 14 января 2020 года"
+        EXPECTED_TITLE_ZH = "微软打算终止对Windows 7操作系统的免费支持"
+        self.assertListEqual(
+            [EXPECTED_TITLE_EN, EXPECTED_TITLE_RU, EXPECTED_TITLE_ZH],
+            generated_titles,
+        )
+
+        summary_ids_beam1 = model.generate(
+            input_ids, num_beams=1, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
+        )
+        generated_titles_beam1_tok = [
+            tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True) for g in summary_ids_beam1
+        ]
+        EXPECTED_TITLE_EN_BEAM1_TOK = "▁Microsoft ▁to ▁end ▁free ▁support ▁for ▁Windows ▁7".split(" ")
+        EXPECTED_TITLE_RU_BEAM1_TOK = "▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года".split(
+            " "
+        )
+        EXPECTED_TITLE_ZH_BEAM1_TOK = "微软 公司 打算 终止 对 Windows ▁7 操作 系统的 免费 支持".split(" ")
+        self.assertListEqual(
+            [EXPECTED_TITLE_EN_BEAM1_TOK, EXPECTED_TITLE_RU_BEAM1_TOK, EXPECTED_TITLE_ZH_BEAM1_TOK],
+            generated_titles_beam1_tok,
+        )
diff --git a/tests/test_tokenization_prophetnet.py b/tests/test_tokenization_prophetnet.py
new file mode 100644
index 0000000000..fc09540908
--- /dev/null
+++ b/tests/test_tokenization_prophetnet.py
@@ -0,0 +1,191 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import unittest
+
+from transformers.testing_utils import slow
+from transformers.tokenization_bert import (
+    BasicTokenizer,
+    WordpieceTokenizer,
+    _is_control,
+    _is_punctuation,
+    _is_whitespace,
+)
+from transformers.tokenization_prophetnet import VOCAB_FILES_NAMES, ProphetNetTokenizer
+
+from .test_tokenization_common import TokenizerTesterMixin
+
+
+class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+
+    tokenizer_class = ProphetNetTokenizer
+    test_rust_tokenizer = False
+
+    def setUp(self):
+        super().setUp()
+
+        vocab_tokens = [
+            "[UNK]",
+            "[CLS]",
+            "[SEP]",
+            "[PAD]",
+            "[MASK]",
+            "want",
+            "##want",
+            "##ed",
+            "wa",
+            "un",
+            "runn",
+            "##ing",
+            ",",
+            "low",
+            "lowest",
+        ]
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+
+    def get_input_output_texts(self, tokenizer):
+        input_text = "UNwant\u00E9d,running"
+        output_text = "unwanted, running"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = self.tokenizer_class(self.vocab_file)
+
+        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
+        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])
+
+    def test_chinese(self):
+        tokenizer = BasicTokenizer()
+
+        self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"])
+
+    def test_basic_tokenizer_lower(self):
+        tokenizer = BasicTokenizer(do_lower_case=True)
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["hello", "!", "how", "are", "you", "?"]
+        )
+        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
+
+    def test_basic_tokenizer_lower_strip_accents_false(self):
+        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hällo", "!", "how", "are", "you", "?"]
+        )
+        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"])
+
+    def test_basic_tokenizer_lower_strip_accents_true(self):
+        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
+        )
+        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
+
+    def test_basic_tokenizer_lower_strip_accents_default(self):
+        tokenizer = BasicTokenizer(do_lower_case=True)
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
+        )
+        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
+
+    def test_basic_tokenizer_no_lower(self):
+        tokenizer = BasicTokenizer(do_lower_case=False)
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
+        )
+
+    def test_basic_tokenizer_no_lower_strip_accents_false(self):
+        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
+        )
+
+    def test_basic_tokenizer_no_lower_strip_accents_true(self):
+        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
+        )
+
+    def test_basic_tokenizer_respects_never_split_tokens(self):
+        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
+        )
+
+    def test_wordpiece_tokenizer(self):
+        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
+
+        vocab = {}
+        for (i, token) in enumerate(vocab_tokens):
+            vocab[token] = i
+        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
+
+        self.assertListEqual(tokenizer.tokenize(""), [])
+
+        self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])
+
+        self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
+
+    def test_is_whitespace(self):
+        self.assertTrue(_is_whitespace(" "))
+        self.assertTrue(_is_whitespace("\t"))
+        self.assertTrue(_is_whitespace("\r"))
+        self.assertTrue(_is_whitespace("\n"))
+        self.assertTrue(_is_whitespace("\u00A0"))
+
+        self.assertFalse(_is_whitespace("A"))
+        self.assertFalse(_is_whitespace("-"))
+
+    def test_is_control(self):
+        self.assertTrue(_is_control("\u0005"))
+
+        self.assertFalse(_is_control("A"))
+        self.assertFalse(_is_control(" "))
+        self.assertFalse(_is_control("\t"))
+        self.assertFalse(_is_control("\r"))
+
+    def test_is_punctuation(self):
+        self.assertTrue(_is_punctuation("-"))
+        self.assertTrue(_is_punctuation("$"))
+        self.assertTrue(_is_punctuation("`"))
+        self.assertTrue(_is_punctuation("."))
+
+        self.assertFalse(_is_punctuation("A"))
+        self.assertFalse(_is_punctuation(" "))
+
+    @slow
+    def test_sequence_builders(self):
+        tokenizer = self.tokenizer_class.from_pretrained("microsoft/prophetnet-large-uncased")
+
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
+
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+
+        assert encoded_sentence == text + [102]
+        assert encoded_pair == text + [102] + text_2 + [102]
diff --git a/tests/test_tokenization_xlm_prophetnet.py b/tests/test_tokenization_xlm_prophetnet.py
new file mode 100644
index 0000000000..83097ff71d
--- /dev/null
+++ b/tests/test_tokenization_xlm_prophetnet.py
@@ -0,0 +1,125 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import unittest
+
+from transformers.file_utils import cached_property
+from transformers.testing_utils import slow
+from transformers.tokenization_xlm_prophetnet import SPIECE_UNDERLINE, XLMProphetNetTokenizer
+
+from .test_tokenization_common import TokenizerTesterMixin
+
+
+SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
+
+
+class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+
+    tokenizer_class = XLMProphetNetTokenizer
+    test_rust_tokenizer = False
+
+    def setUp(self):
+        super().setUp()
+
+        # We have a SentencePiece fixture for testing
+        tokenizer = XLMProphetNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
+        tokenizer.save_pretrained(self.tmpdirname)
+
+    def test_full_tokenizer(self):
+        tokenizer = XLMProphetNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
+
+        tokens = tokenizer.tokenize("This is a test")
+        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
+
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens),
+            [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]],
+        )
+
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
+        self.assertListEqual(
+            tokens,
+            [
+                SPIECE_UNDERLINE + "I",
+                SPIECE_UNDERLINE + "was",
+                SPIECE_UNDERLINE + "b",
+                "or",
+                "n",
+                SPIECE_UNDERLINE + "in",
+                SPIECE_UNDERLINE + "",
+                "9",
+                "2",
+                "0",
+                "0",
+                "0",
+                ",",
+                SPIECE_UNDERLINE + "and",
+                SPIECE_UNDERLINE + "this",
+                SPIECE_UNDERLINE + "is",
+                SPIECE_UNDERLINE + "f",
+                "al",
+                "s",
+                "é",
+                ".",
+            ],
+        )
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(
+            ids,
+            [
+                value + tokenizer.fairseq_offset
+                for value in [8, 21, 84, 55, 24, 19, 7, -9, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, -9, 4]
+            ],
+        )
+
+        back_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(
+            back_tokens,
+            [
+                SPIECE_UNDERLINE + "I",
+                SPIECE_UNDERLINE + "was",
+                SPIECE_UNDERLINE + "b",
+                "or",
+                "n",
+                SPIECE_UNDERLINE + "in",
+                SPIECE_UNDERLINE + "",
+                "[UNK]",
+                "2",
+                "0",
+                "0",
+                "0",
+                ",",
+                SPIECE_UNDERLINE + "and",
+                SPIECE_UNDERLINE + "this",
+                SPIECE_UNDERLINE + "is",
+                SPIECE_UNDERLINE + "f",
+                "al",
+                "s",
+                "[UNK]",
+                ".",
+            ],
+        )
+
+    @cached_property
+    def big_tokenizer(self):
+        return XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+
+    @slow
+    def test_tokenization_base_easy_symbols(self):
+        symbols = "Hello World!"
+        original_tokenizer_encodings = [35389, 6672, 49, 2]
+        self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 68a7c6a836..a0ade81f91 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -43,6 +43,7 @@ TEST_FILES_WITH_NO_COMMON_TESTS = [
     "test_modeling_camembert.py",
     "test_modeling_tf_camembert.py",
     "test_modeling_tf_xlm_roberta.py",
+    "test_modeling_xlm_prophetnet.py",
     "test_modeling_xlm_roberta.py",
     "test_modeling_pegasus.py",
     "test_modeling_mbart.py",
@@ -61,6 +62,7 @@ IGNORE_NON_DOCUMENTED = [
 MODEL_NAME_TO_DOC_FILE = {
     "openai": "gpt.rst",
     "transfo_xl": "transformerxl.rst",
+    "xlm_prophetnet": "xlmprophetnet.rst",
     "xlm_roberta": "xlmroberta.rst",
     "bert_generation": "bertgeneration.rst",
 }
@@ -243,6 +245,9 @@ def _get_model_name(module):
     # Secial case for transfo_xl
     if splits[-1] == "xl":
         return "_".join(splits[-2:])
+    # Special case for xlm_prophetnet
+    if splits[-1] == "prophetnet" and splits[-2] == "xlm":
+        return "_".join(splits[-2:])
     # Secial case for xlm_roberta
     if splits[-1] == "roberta" and splits[-2] == "xlm":
         return "_".join(splits[-2:])