From 207594be81b8e5a8589c8b11c3b236924555d806 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Wed, 22 Dec 2021 16:14:35 -0500
Subject: [PATCH] Convert rst files (#14888)

* Convert all tutorials and guides

* Convert all remaining rst to mdx

* Track and fix bad links
---
 .../{add_new_model.rst => add_new_model.mdx}  | 659 +++++++------
 docs/source/add_new_pipeline.mdx              | 138 +++
 docs/source/add_new_pipeline.rst              | 143 ---
 docs/source/bertology.mdx                     |  36 +
 docs/source/bertology.rst                     |  38 -
 docs/source/{community.md => community.mdx}   |   0
 docs/source/converting_tensorflow_models.mdx  | 162 ++++
 docs/source/converting_tensorflow_models.rst  | 181 ----
 docs/source/fast_tokenizers.mdx               |  70 ++
 docs/source/fast_tokenizers.rst               |  62 --
 docs/source/glossary.mdx                      | 297 ++++++
 docs/source/glossary.rst                      | 322 -------
 .../{installation.md => installation.mdx}     |   0
 docs/source/internal/file_utils.mdx           |  46 +
 docs/source/internal/file_utils.rst           |  54 --
 docs/source/internal/generation_utils.mdx     | 207 ++++
 docs/source/internal/generation_utils.rst     | 230 -----
 docs/source/internal/modeling_utils.mdx       |  82 ++
 docs/source/internal/modeling_utils.rst       |  97 --
 docs/source/internal/pipelines_utils.mdx      |  40 +
 docs/source/internal/pipelines_utils.rst      |  50 -
 docs/source/internal/tokenization_utils.mdx   |  38 +
 docs/source/internal/tokenization_utils.rst   |  45 -
 docs/source/internal/trainer_utils.mdx        |  43 +
 docs/source/internal/trainer_utils.rst        |  54 --
 docs/source/main_classes/callback.mdx         | 106 +++
 docs/source/main_classes/callback.rst         | 115 ---
 docs/source/main_classes/configuration.mdx    |  28 +
 docs/source/main_classes/configuration.rst    |  31 -
 docs/source/main_classes/data_collator.mdx    |  64 ++
 docs/source/main_classes/data_collator.rst    |  78 --
 .../source/main_classes/feature_extractor.mdx |  38 +
 .../source/main_classes/feature_extractor.rst |  48 -
 docs/source/main_classes/keras_callbacks.mdx  |  20 +
 docs/source/main_classes/keras_callbacks.rst  |  22 -
 docs/source/main_classes/logging.mdx          |  80 ++
 docs/source/main_classes/logging.rst          |  83 --
 docs/source/main_classes/model.mdx            |  99 ++
 docs/source/main_classes/model.rst            | 120 ---
 .../main_classes/optimizer_schedules.mdx      |  71 ++
 .../main_classes/optimizer_schedules.rst      |  97 --
 docs/source/main_classes/output.mdx           | 269 ++++++
 docs/source/main_classes/output.rst           | 412 --------
 docs/source/main_classes/pipelines.mdx        | 375 ++++++++
 docs/source/main_classes/pipelines.rst        | 407 --------
 docs/source/main_classes/processors.mdx       | 152 +++
 docs/source/main_classes/processors.rst       | 172 ----
 docs/source/main_classes/tokenizer.mdx        |  77 ++
 docs/source/main_classes/tokenizer.rst        |  78 --
 docs/source/{migration.md => migration.mdx}   |   0
 docs/source/model_doc/mbart.mdx               |   3 +-
 docs/source/model_doc/megatron_bert.mdx       | 128 +++
 docs/source/model_doc/megatron_bert.rst       | 154 ---
 .../{megatron_gpt2.rst => megatron_gpt2.mdx}  |  56 +-
 docs/source/model_doc/mluke.mdx               |  61 ++
 docs/source/model_doc/mluke.rst               |  66 --
 docs/source/model_doc/mobilebert.mdx          | 142 +++
 docs/source/model_doc/mobilebert.rst          | 190 ----
 docs/source/model_doc/mpnet.mdx               | 117 +++
 docs/source/model_doc/mpnet.rst               | 149 ---
 docs/source/model_doc/mt5.mdx                 |  98 ++
 docs/source/model_doc/mt5.rst                 | 129 ---
 docs/source/model_doc/pegasus.mdx             | 137 +++
 docs/source/model_doc/pegasus.rst             | 168 ----
 docs/source/model_doc/phobert.mdx             |  53 ++
 docs/source/model_doc/phobert.rst             |  60 --
 docs/source/model_doc/prophetnet.mdx          |  82 ++
 docs/source/model_doc/prophetnet.rst          | 106 ---
 docs/source/model_doc/qdqbert.mdx             | 158 +++
 docs/source/model_doc/qdqbert.rst             | 189 ----
 docs/source/model_doc/rag.mdx                 |  96 ++
 docs/source/model_doc/rag.rst                 | 118 ---
 docs/source/model_doc/reformer.mdx            | 177 ++++
 docs/source/model_doc/reformer.rst            | 211 ----
 docs/source/model_doc/rembert.mdx             | 128 +++
 docs/source/model_doc/rembert.rst             | 161 ----
 docs/source/model_doc/retribert.mdx           |  40 +
 docs/source/model_doc/retribert.rst           |  52 -
 docs/source/model_doc/roberta.mdx             | 162 ++++
 docs/source/model_doc/roberta.rst             | 210 ----
 docs/source/model_doc/roformer.mdx            | 125 +++
 docs/source/model_doc/roformer.rst            | 161 ----
 docs/source/model_doc/segformer.mdx           | 104 ++
 docs/source/model_doc/segformer.rst           | 132 ---
 docs/source/model_doc/sew.mdx                 |  57 ++
 docs/source/model_doc/sew.rst                 |  67 --
 docs/source/model_doc/sew_d.mdx               |  57 ++
 docs/source/model_doc/sew_d.rst               |  66 --
 docs/source/model_doc/speech_to_text.mdx      | 138 +++
 docs/source/model_doc/speech_to_text.rst      | 153 ---
 docs/source/model_doc/speech_to_text_2.mdx    | 116 +++
 docs/source/model_doc/speech_to_text_2.rst    | 123 ---
 .../source/model_doc/speechencoderdecoder.mdx |  35 +
 .../source/model_doc/speechencoderdecoder.rst |  40 -
 docs/source/model_doc/splinter.mdx            |  74 ++
 docs/source/model_doc/splinter.rst            |  87 --
 docs/source/model_doc/squeezebert.mdx         |  88 ++
 docs/source/model_doc/squeezebert.rst         | 114 ---
 docs/source/model_doc/t5.mdx                  | 342 +++++++
 docs/source/model_doc/t5.rst                  | 364 -------
 docs/source/model_doc/t5v1.1.mdx              |  61 ++
 docs/source/model_doc/t5v1.1.rst              |  66 --
 docs/source/model_doc/tapas.mdx               |   2 +-
 docs/source/model_doc/transformerxl.mdx       | 103 ++
 docs/source/model_doc/transformerxl.rst       | 130 ---
 docs/source/model_doc/trocr.mdx               |   2 +-
 docs/source/model_doc/unispeech.mdx           |  71 ++
 docs/source/model_doc/unispeech.rst           |  88 --
 docs/source/model_doc/unispeech_sat.mdx       |  86 ++
 docs/source/model_doc/unispeech_sat.rst       | 106 ---
 .../model_doc/vision_text_dual_encoder.mdx    |  43 +
 .../model_doc/vision_text_dual_encoder.rst    |  56 --
 .../source/model_doc/visionencoderdecoder.mdx |  40 +
 .../source/model_doc/visionencoderdecoder.rst |  48 -
 docs/source/model_doc/visual_bert.mdx         | 123 +++
 docs/source/model_doc/visual_bert.rst         | 143 ---
 docs/source/model_doc/vit.mdx                 | 127 +++
 docs/source/model_doc/vit.rst                 | 151 ---
 docs/source/model_doc/wav2vec2.mdx            | 141 +++
 docs/source/model_doc/wav2vec2.rst            | 169 ----
 docs/source/model_doc/wavlm.mdx               |  79 ++
 docs/source/model_doc/wavlm.rst               |  97 --
 docs/source/model_doc/xlm.mdx                 | 124 +++
 docs/source/model_doc/xlm.rst                 | 159 ----
 docs/source/model_doc/xlmprophetnet.mdx       |  68 ++
 docs/source/model_doc/xlmprophetnet.rst       |  87 --
 docs/source/model_doc/xlmroberta.mdx          | 126 +++
 docs/source/model_doc/xlmroberta.rst          | 161 ----
 docs/source/model_doc/xlnet.mdx               | 154 +++
 docs/source/model_doc/xlnet.rst               | 204 ----
 .../source/model_doc/{xls_r.rst => xls_r.mdx} |  32 +-
 .../{xlsr_wav2vec2.rst => xlsr_wav2vec2.mdx}  |  32 +-
 docs/source/model_sharing.mdx                 | 395 ++++++++
 docs/source/model_sharing.rst                 | 423 --------
 docs/source/model_summary.mdx                 | 793 +++++++++++++++
 docs/source/model_summary.rst                 | 901 ------------------
 .../{parallelism.md => parallelism.mdx}       |   0
 .../{performance.md => performance.mdx}       |   0
 docs/source/philosophy.mdx                    |  80 ++
 docs/source/philosophy.rst                    |  85 --
 docs/source/{pr_checks.md => pr_checks.mdx}   |   0
 docs/source/{sagemaker.md => sagemaker.mdx}   |   0
 docs/source/serialization.mdx                 | 430 +++++++++
 docs/source/serialization.rst                 | 432 ---------
 ...troubleshooting.md => troubleshooting.mdx} |   0
 src/transformers/commands/add_new_model.py    |   4 +-
 src/transformers/integrations.py              |   2 +-
 src/transformers/models/beit/modeling_beit.py |   8 +-
 .../models/bertweet/tokenization_bertweet.py  |   2 +-
 .../models/cpm/tokenization_cpm.py            |   2 +-
 .../models/cpm/tokenization_cpm_fast.py       |   2 +-
 src/transformers/models/fnet/modeling_fnet.py |   4 +-
 .../models/hubert/modeling_hubert.py          |   8 +-
 .../models/hubert/modeling_tf_hubert.py       |   8 +-
 .../models/ibert/modeling_ibert.py            |   4 +-
 .../models/layoutlm/modeling_layoutlm.py      |   6 +-
 .../models/layoutlmv2/modeling_layoutlmv2.py  |  14 +-
 .../models/prophetnet/modeling_prophetnet.py  |   2 +-
 src/transformers/models/rag/modeling_rag.py   |   4 +-
 .../models/rag/modeling_tf_rag.py             |   4 +-
 .../models/roformer/tokenization_roformer.py  |   2 +-
 src/transformers/models/sew/modeling_sew.py   |   8 +-
 .../models/sew_d/modeling_sew_d.py            |   8 +-
 .../models/splinter/modeling_splinter.py      |   4 +-
 .../models/unispeech/modeling_unispeech.py    |  12 +-
 .../unispeech_sat/modeling_unispeech_sat.py   |  12 +-
 .../visual_bert/modeling_visual_bert.py       |   4 +-
 .../models/wav2vec2/modeling_flax_wav2vec2.py |   8 +-
 .../models/wav2vec2/modeling_tf_wav2vec2.py   |   8 +-
 .../models/wav2vec2/modeling_wav2vec2.py      |  12 +-
 .../models/wavlm/modeling_wavlm.py            |  12 +-
 .../{{cookiecutter.lowercase_modelname}}.mdx  | 238 +++++
 .../{{cookiecutter.lowercase_modelname}}.rst  | 272 ------
 utils/check_table.py                          |   6 +-
 174 files changed, 8995 insertions(+), 10483 deletions(-)
 rename docs/source/{add_new_model.rst => add_new_model.mdx} (57%)
 create mode 100644 docs/source/add_new_pipeline.mdx
 delete mode 100644 docs/source/add_new_pipeline.rst
 create mode 100644 docs/source/bertology.mdx
 delete mode 100644 docs/source/bertology.rst
 rename docs/source/{community.md => community.mdx} (100%)
 create mode 100644 docs/source/converting_tensorflow_models.mdx
 delete mode 100644 docs/source/converting_tensorflow_models.rst
 create mode 100644 docs/source/fast_tokenizers.mdx
 delete mode 100644 docs/source/fast_tokenizers.rst
 create mode 100644 docs/source/glossary.mdx
 delete mode 100644 docs/source/glossary.rst
 rename docs/source/{installation.md => installation.mdx} (100%)
 create mode 100644 docs/source/internal/file_utils.mdx
 delete mode 100644 docs/source/internal/file_utils.rst
 create mode 100644 docs/source/internal/generation_utils.mdx
 delete mode 100644 docs/source/internal/generation_utils.rst
 create mode 100644 docs/source/internal/modeling_utils.mdx
 delete mode 100644 docs/source/internal/modeling_utils.rst
 create mode 100644 docs/source/internal/pipelines_utils.mdx
 delete mode 100644 docs/source/internal/pipelines_utils.rst
 create mode 100644 docs/source/internal/tokenization_utils.mdx
 delete mode 100644 docs/source/internal/tokenization_utils.rst
 create mode 100644 docs/source/internal/trainer_utils.mdx
 delete mode 100644 docs/source/internal/trainer_utils.rst
 create mode 100644 docs/source/main_classes/callback.mdx
 delete mode 100644 docs/source/main_classes/callback.rst
 create mode 100644 docs/source/main_classes/configuration.mdx
 delete mode 100644 docs/source/main_classes/configuration.rst
 create mode 100644 docs/source/main_classes/data_collator.mdx
 delete mode 100644 docs/source/main_classes/data_collator.rst
 create mode 100644 docs/source/main_classes/feature_extractor.mdx
 delete mode 100644 docs/source/main_classes/feature_extractor.rst
 create mode 100644 docs/source/main_classes/keras_callbacks.mdx
 delete mode 100644 docs/source/main_classes/keras_callbacks.rst
 create mode 100644 docs/source/main_classes/logging.mdx
 delete mode 100644 docs/source/main_classes/logging.rst
 create mode 100644 docs/source/main_classes/model.mdx
 delete mode 100644 docs/source/main_classes/model.rst
 create mode 100644 docs/source/main_classes/optimizer_schedules.mdx
 delete mode 100644 docs/source/main_classes/optimizer_schedules.rst
 create mode 100644 docs/source/main_classes/output.mdx
 delete mode 100644 docs/source/main_classes/output.rst
 create mode 100644 docs/source/main_classes/pipelines.mdx
 delete mode 100644 docs/source/main_classes/pipelines.rst
 create mode 100644 docs/source/main_classes/processors.mdx
 delete mode 100644 docs/source/main_classes/processors.rst
 create mode 100644 docs/source/main_classes/tokenizer.mdx
 delete mode 100644 docs/source/main_classes/tokenizer.rst
 rename docs/source/{migration.md => migration.mdx} (100%)
 create mode 100644 docs/source/model_doc/megatron_bert.mdx
 delete mode 100644 docs/source/model_doc/megatron_bert.rst
 rename docs/source/model_doc/{megatron_gpt2.rst => megatron_gpt2.mdx} (52%)
 create mode 100644 docs/source/model_doc/mluke.mdx
 delete mode 100644 docs/source/model_doc/mluke.rst
 create mode 100644 docs/source/model_doc/mobilebert.mdx
 delete mode 100644 docs/source/model_doc/mobilebert.rst
 create mode 100644 docs/source/model_doc/mpnet.mdx
 delete mode 100644 docs/source/model_doc/mpnet.rst
 create mode 100644 docs/source/model_doc/mt5.mdx
 delete mode 100644 docs/source/model_doc/mt5.rst
 create mode 100644 docs/source/model_doc/pegasus.mdx
 delete mode 100644 docs/source/model_doc/pegasus.rst
 create mode 100644 docs/source/model_doc/phobert.mdx
 delete mode 100644 docs/source/model_doc/phobert.rst
 create mode 100644 docs/source/model_doc/prophetnet.mdx
 delete mode 100644 docs/source/model_doc/prophetnet.rst
 create mode 100644 docs/source/model_doc/qdqbert.mdx
 delete mode 100644 docs/source/model_doc/qdqbert.rst
 create mode 100644 docs/source/model_doc/rag.mdx
 delete mode 100644 docs/source/model_doc/rag.rst
 create mode 100644 docs/source/model_doc/reformer.mdx
 delete mode 100644 docs/source/model_doc/reformer.rst
 create mode 100644 docs/source/model_doc/rembert.mdx
 delete mode 100644 docs/source/model_doc/rembert.rst
 create mode 100644 docs/source/model_doc/retribert.mdx
 delete mode 100644 docs/source/model_doc/retribert.rst
 create mode 100644 docs/source/model_doc/roberta.mdx
 delete mode 100644 docs/source/model_doc/roberta.rst
 create mode 100644 docs/source/model_doc/roformer.mdx
 delete mode 100644 docs/source/model_doc/roformer.rst
 create mode 100644 docs/source/model_doc/segformer.mdx
 delete mode 100644 docs/source/model_doc/segformer.rst
 create mode 100644 docs/source/model_doc/sew.mdx
 delete mode 100644 docs/source/model_doc/sew.rst
 create mode 100644 docs/source/model_doc/sew_d.mdx
 delete mode 100644 docs/source/model_doc/sew_d.rst
 create mode 100644 docs/source/model_doc/speech_to_text.mdx
 delete mode 100644 docs/source/model_doc/speech_to_text.rst
 create mode 100644 docs/source/model_doc/speech_to_text_2.mdx
 delete mode 100644 docs/source/model_doc/speech_to_text_2.rst
 create mode 100644 docs/source/model_doc/speechencoderdecoder.mdx
 delete mode 100644 docs/source/model_doc/speechencoderdecoder.rst
 create mode 100644 docs/source/model_doc/splinter.mdx
 delete mode 100644 docs/source/model_doc/splinter.rst
 create mode 100644 docs/source/model_doc/squeezebert.mdx
 delete mode 100644 docs/source/model_doc/squeezebert.rst
 create mode 100644 docs/source/model_doc/t5.mdx
 delete mode 100644 docs/source/model_doc/t5.rst
 create mode 100644 docs/source/model_doc/t5v1.1.mdx
 delete mode 100644 docs/source/model_doc/t5v1.1.rst
 create mode 100644 docs/source/model_doc/transformerxl.mdx
 delete mode 100644 docs/source/model_doc/transformerxl.rst
 create mode 100644 docs/source/model_doc/unispeech.mdx
 delete mode 100644 docs/source/model_doc/unispeech.rst
 create mode 100644 docs/source/model_doc/unispeech_sat.mdx
 delete mode 100644 docs/source/model_doc/unispeech_sat.rst
 create mode 100644 docs/source/model_doc/vision_text_dual_encoder.mdx
 delete mode 100644 docs/source/model_doc/vision_text_dual_encoder.rst
 create mode 100644 docs/source/model_doc/visionencoderdecoder.mdx
 delete mode 100644 docs/source/model_doc/visionencoderdecoder.rst
 create mode 100644 docs/source/model_doc/visual_bert.mdx
 delete mode 100644 docs/source/model_doc/visual_bert.rst
 create mode 100644 docs/source/model_doc/vit.mdx
 delete mode 100644 docs/source/model_doc/vit.rst
 create mode 100644 docs/source/model_doc/wav2vec2.mdx
 delete mode 100644 docs/source/model_doc/wav2vec2.rst
 create mode 100644 docs/source/model_doc/wavlm.mdx
 delete mode 100644 docs/source/model_doc/wavlm.rst
 create mode 100644 docs/source/model_doc/xlm.mdx
 delete mode 100644 docs/source/model_doc/xlm.rst
 create mode 100644 docs/source/model_doc/xlmprophetnet.mdx
 delete mode 100644 docs/source/model_doc/xlmprophetnet.rst
 create mode 100644 docs/source/model_doc/xlmroberta.mdx
 delete mode 100644 docs/source/model_doc/xlmroberta.rst
 create mode 100644 docs/source/model_doc/xlnet.mdx
 delete mode 100644 docs/source/model_doc/xlnet.rst
 rename docs/source/model_doc/{xls_r.rst => xls_r.mdx} (56%)
 rename docs/source/model_doc/{xlsr_wav2vec2.rst => xlsr_wav2vec2.mdx} (52%)
 create mode 100644 docs/source/model_sharing.mdx
 delete mode 100644 docs/source/model_sharing.rst
 create mode 100644 docs/source/model_summary.mdx
 delete mode 100644 docs/source/model_summary.rst
 rename docs/source/{parallelism.md => parallelism.mdx} (100%)
 rename docs/source/{performance.md => performance.mdx} (100%)
 create mode 100644 docs/source/philosophy.mdx
 delete mode 100644 docs/source/philosophy.rst
 rename docs/source/{pr_checks.md => pr_checks.mdx} (100%)
 rename docs/source/{sagemaker.md => sagemaker.mdx} (100%)
 create mode 100644 docs/source/serialization.mdx
 delete mode 100644 docs/source/serialization.rst
 rename docs/source/{troubleshooting.md => troubleshooting.mdx} (100%)
 create mode 100644 templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.mdx
 delete mode 100644 templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst

diff --git a/docs/source/add_new_model.rst b/docs/source/add_new_model.mdx
similarity index 57%
rename from docs/source/add_new_model.rst
rename to docs/source/add_new_model.mdx
index decd664f6c..9f49d78a58 100644
--- a/docs/source/add_new_model.rst
+++ b/docs/source/add_new_model.mdx
@@ -1,16 +1,15 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
 
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
 
-        http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+-->
 
-How to add a model to 🤗 Transformers?
-=======================================================================================================================
+# How to add a model to 🤗 Transformers?
 
 Adding a new model is often difficult and requires an in-depth knowledge of the 🤗 Transformers library and ideally also
 of the model's original repository. At Hugging Face, we are trying to empower the community more and more to add models
@@ -20,8 +19,7 @@ independently. Thus, for some new models that the community wants to be added to
 model to 🤗 Transformers.
 
 If this sounds like something you would be interested in, feel free to check out the currently open
-“calls-for-model-addition” `here
-<https://github.com/huggingface/transformers/tree/master/templates/adding_a_new_model/open_model_proposals/README.md>`__
+“calls-for-model-addition” [here](https://github.com/huggingface/transformers/tree/master/templates/adding_a_new_model/open_model_proposals/README.md)
 and to contact us.
 
 If selected, you will then work closely with one member of the Hugging Face team to integrate the model into 🤗
@@ -31,31 +29,28 @@ more importantly, you will have made a major open-source contribution to 🤗 Tr
 -  get insights into open-source best practices
 -  understand the design principles of one of the most popular NLP libraries
 -  learn how to do efficiently test large NLP models
--  learn how to integrate Python utilities like ``black``, ``isort``, ``make fix-copies`` into a library to always
-   ensure clean and readable code
+-  learn how to integrate Python utilities like `black`, `isort`, `make fix-copies` into a library to always
+  ensure clean and readable code
 
 We are also more than happy if you want to add a model that cannot be found in the “calls-for-model-addition” folder.
 The following sections explain in detail how to add a new model. It might also be very helpful to check out already
-added models to see if those resemble the model you would like to add `here
-<https://github.com/huggingface/transformers/pulls?q=is%3Apr+label%3A%22PR+for+Model+Addition%22+is%3Aclosed>`__.
+added models to see if those resemble the model you would like to add [here](https://github.com/huggingface/transformers/pulls?q=is%3Apr+label%3A%22PR+for+Model+Addition%22+is%3Aclosed).
 
 To start, let's try to get a general overview of the Transformers library.
 
-General overview of 🤗 Transformers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+## General overview of 🤗 Transformers
 
 First, you should get a general overview of 🤗 Transformers. 🤗 Transformers is a very opinionated library, so there is a
 chance that you don't agree with some of the library's philosophies or design choices. From our experience, however, we
 found that the fundamental design choices and philosophies of the library are crucial to efficiently scale 🤗
 Transformers while keeping maintenance costs at a reasonable level.
 
-A good first starting point to better understand the library is to read the :doc:`documentation of our philosophy
-<philosophy>`. As a result of our way of working, there are some choices that we try to apply to all models:
+A good first starting point to better understand the library is to read the [documentation of our philosophy](philosophy). As a result of our way of working, there are some choices that we try to apply to all models:
 
--  Composition is generally favored over-abstraction
--  Duplicating code is not always bad if it strongly improves the readability or accessibility of a model
--  Model files are as self-contained as possible so that when you read the code of a specific model, you ideally only
-   have to look into the respective ``modeling_....py`` file.
+- Composition is generally favored over-abstraction
+- Duplicating code is not always bad if it strongly improves the readability or accessibility of a model
+- Model files are as self-contained as possible so that when you read the code of a specific model, you ideally only
+  have to look into the respective `modeling_....py` file.
 
 In our opinion, the library's code is not just a means to provide a product, *e.g.* the ability to use BERT for
 inference, but also as the very product that we want to improve. Hence, when adding a model, the user is not only the
@@ -63,72 +58,68 @@ person that will use your model, but also everybody that will read, try to under
 
 With this in mind, let's go a bit deeper into the general library design.
 
-Overview of models
------------------------------------------------------------------------------------------------------------------------
+### Overview of models
 
 To successfully add a model, it is important to understand the interaction between your model and its config,
-:class:`~transformers.PreTrainedModel`, and :class:`~transformers.PretrainedConfig`. For exemplary purposes, we will
-call the model to be added to 🤗 Transformers ``BrandNewBert``.
+[`PreTrainedModel`], and [`PretrainedConfig`]. For exemplary purposes, we will
+call the model to be added to 🤗 Transformers `BrandNewBert`.
 
 Let's take a look:
 
-.. image:: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_overview.png
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_overview.png"/>
 
 As you can see, we do make use of inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute
-minimum. There are never more than two levels of abstraction for any model in the library. :obj:`BrandNewBertModel`
-inherits from :obj:`BrandNewBertPreTrainedModel` which in turn inherits from :class:`~transformers.PreTrainedModel` and
+minimum. There are never more than two levels of abstraction for any model in the library. `BrandNewBertModel`
+inherits from `BrandNewBertPreTrainedModel` which in turn inherits from [`PreTrainedModel`] and
 that's it. As a general rule, we want to make sure that a new model only depends on
-:class:`~transformers.PreTrainedModel`. The important functionalities that are automatically provided to every new
-model are :meth:`~transformers.PreTrainedModel.from_pretrained` and
-:meth:`~transformers.PreTrainedModel.save_pretrained`, which are used for serialization and deserialization. All of the
-other important functionalities, such as :meth:`BrandNewBertModel.forward` should be completely defined in the new
-``modeling_brand_new_bert.py`` script. Next, we want to make sure that a model with a specific head layer, such as
-:obj:`BrandNewBertForMaskedLM` does not inherit from :obj:`BrandNewBertModel`, but rather uses :obj:`BrandNewBertModel`
+[`PreTrainedModel`]. The important functionalities that are automatically provided to every new
+model are [`~PreTrainedModel.from_pretrained`] and
+[`~PreTrainedModel.save_pretrained`], which are used for serialization and deserialization. All of the
+other important functionalities, such as `BrandNewBertModel.forward` should be completely defined in the new
+`modeling_brand_new_bert.py` script. Next, we want to make sure that a model with a specific head layer, such as
+`BrandNewBertForMaskedLM` does not inherit from `BrandNewBertModel`, but rather uses `BrandNewBertModel`
 as a component that can be called in its forward pass to keep the level of abstraction low. Every new model requires a
-configuration class, called :obj:`BrandNewBertConfig`. This configuration is always stored as an attribute in
-:class:`~transformers.PreTrainedModel`, and thus can be accessed via the ``config`` attribute for all classes
-inheriting from :obj:`BrandNewBertPreTrainedModel`:
+configuration class, called `BrandNewBertConfig`. This configuration is always stored as an attribute in
+[`PreTrainedModel`], and thus can be accessed via the `config` attribute for all classes
+inheriting from `BrandNewBertPreTrainedModel`:
 
-   .. code:: python
-
-      model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert")
-      model.config  # model has access to its config
+```python
+model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert")
+model.config  # model has access to its config
+```
 
 Similar to the model, the configuration inherits basic serialization and deserialization functionalities from
-:class:`~transformers.PretrainedConfig`. Note that the configuration and the model are always serialized into two
-different formats - the model to a `pytorch_model.bin` file and the configuration to a `config.json` file. Calling
-:meth:`~transformers.PreTrainedModel.save_pretrained` will automatically call
-:meth:`~transformers.PretrainedConfig.save_pretrained`, so that both model and configuration are saved.
+[`PretrainedConfig`]. Note that the configuration and the model are always serialized into two
+different formats - the model to a *pytorch_model.bin* file and the configuration to a *config.json* file. Calling
+[`~PreTrainedModel.save_pretrained`] will automatically call
+[`~PretrainedConfig.save_pretrained`], so that both model and configuration are saved.
 
 
-Overview of tokenizers
------------------------------------------------------------------------------------------------------------------------
+### Overview of tokenizers
 
 Not quite ready yet :-( This section will be added soon!
 
-Step-by-step recipe to add a model to 🤗 Transformers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+## Step-by-step recipe to add a model to 🤗 Transformers
 
 Everyone has different preferences of how to port a model so it can be very helpful for you to take a look at summaries
 of how other contributors ported models to Hugging Face. Here is a list of community blog posts on how to port a model:
 
-1. `Porting GPT2 Model <https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28>`__ by `Thomas
-   <https://huggingface.co/thomwolf>`__
-2. `Porting WMT19 MT Model <https://huggingface.co/blog/porting-fsmt>`__ by `Stas <https://huggingface.co/stas>`__
+1. [Porting GPT2 Model](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) by [Thomas](https://huggingface.co/thomwolf)
+2. [Porting WMT19 MT Model](https://huggingface.co/blog/porting-fsmt) by [Stas](https://huggingface.co/stas)
 
 From experience, we can tell you that the most important things to keep in mind when adding a model are:
 
 -  Don't reinvent the wheel! Most parts of the code you will add for the new 🤗 Transformers model already exist
-   somewhere in 🤗 Transformers. Take some time to find similar, already existing models and tokenizers you can copy
-   from. `grep <https://www.gnu.org/software/grep/>`__ and `rg <https://github.com/BurntSushi/ripgrep>`__ are your
-   friends. Note that it might very well happen that your model's tokenizer is based on one model implementation, and
-   your model's modeling code on another one. *E.g.* FSMT's modeling code is based on BART, while FSMT's tokenizer code
-   is based on XLM.
+  somewhere in 🤗 Transformers. Take some time to find similar, already existing models and tokenizers you can copy
+  from. [grep](https://www.gnu.org/software/grep/) and [rg](https://github.com/BurntSushi/ripgrep) are your
+  friends. Note that it might very well happen that your model's tokenizer is based on one model implementation, and
+  your model's modeling code on another one. *E.g.* FSMT's modeling code is based on BART, while FSMT's tokenizer code
+  is based on XLM.
 -  It's more of an engineering challenge than a scientific challenge. You should spend more time on creating an
-   efficient debugging environment than trying to understand all theoretical aspects of the model in the paper.
+  efficient debugging environment than trying to understand all theoretical aspects of the model in the paper.
 -  Ask for help, when you're stuck! Models are the core component of 🤗 Transformers so that we at Hugging Face are more
-   than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making
-   progress.
+  than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making
+  progress.
 
 In the following, we try to give you a general recipe that we found most useful when porting a model to 🤗 Transformers.
 
@@ -150,14 +141,13 @@ List:
 -  13. ☐ Submitted the pull request
 -  14. ☐ (Optional) Added a demo notebook
 
-To begin with, we usually recommend to start by getting a good theoretical understanding of ``BrandNewBert``. However,
+To begin with, we usually recommend to start by getting a good theoretical understanding of `BrandNewBert`. However,
 if you prefer to understand the theoretical aspects of the model *on-the-job*, then it is totally fine to directly dive
-into the ``BrandNewBert``'s code-base. This option might suit you better, if your engineering skills are better than
-your theoretical skill, if you have trouble understanding ``BrandNewBert``'s paper, or if you just enjoy programming
+into the `BrandNewBert`'s code-base. This option might suit you better, if your engineering skills are better than
+your theoretical skill, if you have trouble understanding `BrandNewBert`'s paper, or if you just enjoy programming
 much more than reading scientific papers.
 
-1. (Optional) Theoretical aspects of BrandNewBert
------------------------------------------------------------------------------------------------------------------------
+### 1. (Optional) Theoretical aspects of BrandNewBert
 
 You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large
 sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is
@@ -166,46 +156,45 @@ effectively re-implement the model in 🤗 Transformers. That being said, you do
 theoretical aspects, but rather focus on the practical ones, namely:
 
 -  What type of model is *brand_new_bert*? BERT-like encoder-only model? GPT2-like decoder-only model? BART-like
-   encoder-decoder model? Look at the :doc:`model_summary` if you're not familiar with the differences between those.
+  encoder-decoder model? Look at the [model_summary](model_summary) if you're not familiar with the differences between those.
 -  What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2Seq tasks, *e.g.,*
-   summarization?
+  summarization?
 -  What is the novel feature of the model making it different from BERT/GPT-2/BART?
--  Which of the already existing `🤗 Transformers models <https://huggingface.co/transformers/#contents>`__ is most
-   similar to *brand_new_bert*?
+-  Which of the already existing [🤗 Transformers models](https://huggingface.co/transformers/#contents) is most
+  similar to *brand_new_bert*?
 -  What type of tokenizer is used? A sentencepiece tokenizer? Word piece tokenizer? Is it the same tokenizer as used
-   for BERT or BART?
+  for BERT or BART?
 
 After you feel like you have gotten a good overview of the architecture of the model, you might want to write to the
 Hugging Face team with any questions you might have. This might include questions regarding the model's architecture,
 its attention layer, etc. We will be more than happy to help you.
 
-2. Next prepare your environment
------------------------------------------------------------------------------------------------------------------------
+### 2. Next prepare your environment
 
-1. Fork the `repository <https://github.com/huggingface/transformers>`__ by clicking on the ‘Fork' button on the
+1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the ‘Fork' button on the
    repository's page. This creates a copy of the code under your GitHub user account.
 
-2. Clone your ``transformers`` fork to your local disk, and add the base repository as a remote:
+2. Clone your `transformers` fork to your local disk, and add the base repository as a remote:
 
-   .. code:: bash
-
-      git clone https://github.com/[your Github handle]/transformers.git
-      cd transformers
-      git remote add upstream https://github.com/huggingface/transformers.git
+```bash
+git clone https://github.com/[your Github handle]/transformers.git
+cd transformers
+git remote add upstream https://github.com/huggingface/transformers.git
+```
 
 3. Set up a development environment, for instance by running the following command:
 
-   .. code:: bash
-
-      python -m venv .env
-      source .env/bin/activate
-      pip install -e ".[dev]"
+```bash
+python -m venv .env
+source .env/bin/activate
+pip install -e ".[dev]"
+```
 
 and return to the parent directory
 
-.. code:: bash
-
-   cd ..
+```bash
+cd ..
+```
 
 4. We recommend adding the PyTorch version of *brand_new_bert* to Transformers. To install PyTorch, please follow the
    instructions on https://pytorch.org/get-started/locally/.
@@ -214,16 +203,15 @@ and return to the parent directory
 
 5. To port *brand_new_bert*, you will also need access to its original repository:
 
-.. code:: bash
-
-   git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git 
-   cd brand_new_bert
-   pip install -e .
+```bash
+git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git 
+cd brand_new_bert
+pip install -e .
+```
 
 Now you have set up a development environment to port *brand_new_bert* to 🤗 Transformers.
 
-3.-4. Run a pretrained checkpoint using the original repository
------------------------------------------------------------------------------------------------------------------------
+### 3.-4. Run a pretrained checkpoint using the original repository
 
 At first, you will work on the original *brand_new_bert* repository. Often, the original implementation is very
 “researchy”. Meaning that documentation might be lacking and the code can be difficult to understand. But this should
@@ -238,16 +226,16 @@ Successfully running the official pretrained model in the original repository is
 From our experience, it is very important to spend some time getting familiar with the original code-base. You need to
 figure out the following:
 
--  Where to find the pretrained weights?
--  How to load the pretrained weights into the corresponding model?
--  How to run the tokenizer independently from the model?
--  Trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually,
-   you only have to reimplement those functions.
--  Be able to locate the important components of the model: Where is the model's class? Are there model sub-classes,
-   *e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers,
-   *e.g.* *self-attention*, *cross-attention*...?
--  How can you debug the model in the original environment of the repo? Do you have to add `print` statements, can you
-   work with an interactive debugger like `ipdb`, or should you use an efficient IDE to debug the model, like PyCharm?
+- Where to find the pretrained weights?
+- How to load the pretrained weights into the corresponding model?
+- How to run the tokenizer independently from the model?
+- Trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually,
+  you only have to reimplement those functions.
+- Be able to locate the important components of the model: Where is the model's class? Are there model sub-classes,
+  *e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers,
+  *e.g.* *self-attention*, *cross-attention*...?
+- How can you debug the model in the original environment of the repo? Do you have to add *print* statements, can you
+  work with an interactive debugger like *ipdb*, or should you use an efficient IDE to debug the model, like PyCharm?
 
 It is very important that before you start the porting process, that you can **efficiently** debug code in the original
 repository! Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or
@@ -262,8 +250,7 @@ model also works as expected on GPU.
 
 In general, there are two possible debugging environments for running the original model
 
--  `Jupyter notebooks <https://jupyter.org/>`__ / `google colab
-   <https://colab.research.google.com/notebooks/intro.ipynb>`__
+-  [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb)
 -  Local python scripts.
 
 Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split
@@ -273,24 +260,24 @@ Face team for help. If you are familiar with Jupiter notebooks, we strongly reco
 
 The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend
 some time adjusting to the new programming environment and that you might not be able to use your known debugging tools
-anymore, like ``ipdb``.
+anymore, like `ipdb`.
 
 For each code-base, a good first step is always to load a **small** pretrained checkpoint and to be able to reproduce a
 single forward pass using a dummy integer vector of input IDs as an input. Such a script could look like this (in
 pseudocode):
 
-.. code:: bash
-
-   model = BrandNewBertModel.load_pretrained_checkpoint(/path/to/checkpoint/)
-   input_ids = [0, 4, 5, 2, 3, 7, 9]  # vector of input ids
-   original_output = model.predict(input_ids)
+```python
+model = BrandNewBertModel.load_pretrained_checkpoint(/path/to/checkpoint/)
+input_ids = [0, 4, 5, 2, 3, 7, 9]  # vector of input ids
+original_output = model.predict(input_ids)
+```
 
 Next, regarding the debugging strategy, there are generally a few from which to choose from:
 
--  Decompose the original model into many small testable components and run a forward pass on each of those for
-   verification
--  Decompose the original model only into the original *tokenizer* and the original *model*, run a forward pass on
-   those, and use intermediate print statements or breakpoints for verification
+- Decompose the original model into many small testable components and run a forward pass on each of those for
+  verification
+- Decompose the original model only into the original *tokenizer* and the original *model*, run a forward pass on
+  those, and use intermediate print statements or breakpoints for verification
 
 Again, it is up to you which strategy to choose. Often, one or the other is advantageous depending on the original code
 base.
@@ -309,12 +296,12 @@ to taking the more difficult road in the beginning:
 - at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue
   changing your code
 
-`Lysandre's <https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed>`__ integration checks for ELECTRA
+[Lysandre's](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) integration checks for ELECTRA
 gives a nice example of how this can be done.
 
 However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode,
 it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. A good
-example is `T5's MeshTensorFlow <https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow>`__ library which is
+example is [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) library which is
 very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, one
 often relies on verifying print statements.
 
@@ -324,27 +311,27 @@ starting layers first and the ending layers last.
 It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following
 layers in the following order:
 
-1.  Retrieve the input IDs passed to the model
-2.  Retrieve the word embeddings
-3.  Retrieve the input of the first Transformer layer
-4.  Retrieve the output of the first Transformer layer
-5.  Retrieve the output of the following n - 1 Transformer layers
-6.  Retrieve the output of the whole BrandNewBert Model
+1. Retrieve the input IDs passed to the model
+2. Retrieve the word embeddings
+3. Retrieve the input of the first Transformer layer
+4. Retrieve the output of the first Transformer layer
+5. Retrieve the output of the following n - 1 Transformer layers
+6. Retrieve the output of the whole BrandNewBert Model
 
-Input IDs should thereby consists of an array of integers, *e.g.* ``input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]``
+Input IDs should thereby consists of an array of integers, *e.g.* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]`
 
 The outputs of the following layers often consist of multi-dimensional float arrays and can look like this:
 
-.. code:: bash
-
-   [[
-    [-0.1465, -0.6501,  0.1993,  ...,  0.1451,  0.3430,  0.6024],
-    [-0.4417, -0.5920,  0.3450,  ..., -0.3062,  0.6182,  0.7132],
-    [-0.5009, -0.7122,  0.4548,  ..., -0.3662,  0.6091,  0.7648],
-    ...,
-    [-0.5613, -0.6332,  0.4324,  ..., -0.3792,  0.7372,  0.9288],
-    [-0.5416, -0.6345,  0.4180,  ..., -0.3564,  0.6992,  0.9191],
-    [-0.5334, -0.6403,  0.4271,  ..., -0.3339,  0.6533,  0.8694]]],
+```
+[[
+ [-0.1465, -0.6501,  0.1993,  ...,  0.1451,  0.3430,  0.6024],
+ [-0.4417, -0.5920,  0.3450,  ..., -0.3062,  0.6182,  0.7132],
+ [-0.5009, -0.7122,  0.4548,  ..., -0.3662,  0.6091,  0.7648],
+ ...,
+ [-0.5613, -0.6332,  0.4324,  ..., -0.3792,  0.7372,  0.9288],
+ [-0.5416, -0.6345,  0.4180,  ..., -0.3564,  0.6992,  0.9191],
+ [-0.5334, -0.6403,  0.4271,  ..., -0.3339,  0.6533,  0.8694]]],
+```
 
 We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original
 model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001!
@@ -355,54 +342,52 @@ outputs of the 🤗 Transformers version multiple times against the intermediate
 *brand_new_bert* in which case an **efficient** debugging environment of the original repository is absolutely
 important. Here is some advice is to make your debugging environment as efficient as possible.
 
--  Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should
-   probably take the time to write a longer script that decomposes the original model into smaller sub-components to
-   retrieve intermediate values. Is the original repository written in Tensorflow 1? Then you might have to rely on
-   TensorFlow print operations like `tf.print <https://www.tensorflow.org/api_docs/python/tf/print>`__ to output
-   intermediate values. Is the original repository written in Jax? Then make sure that the model is **not jitted** when
-   running the forward pass, *e.g.* check-out `this link <https://github.com/google/jax/issues/196>`__.
--  Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debug cycle
-   becomes. It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds.
-   In case only very large checkpoints are available, it might make more sense to create a dummy model in the new
-   environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version
-   of your model
--  Make sure you are using the easiest way of calling a forward pass in the original repository. Ideally, you want to
-   find the function in the original repository that **only** calls a single forward pass, *i.e.* that is often called
-   ``predict``, ``evaluate``, ``forward`` or ``__call__``. You don't want to debug a function that calls ``forward``
-   multiple times, *e.g.* to generate text, like ``autoregressive_sample``, ``generate``.
--  Try to separate the tokenization from the model's `forward` pass. If the original repository shows examples where
-   you have to input a string, then try to find out where in the forward call the string input is changed to input ids
-   and start from this point. This might mean that you have to possibly write a small script yourself or change the
-   original code so that you can directly input the ids instead of an input string.
--  Make sure that the model in your debugging setup is **not** in training mode, which often causes the model to yield
-   random outputs due to multiple dropout layers in the model. Make sure that the forward pass in your debugging
-   environment is **deterministic** so that the dropout layers are not used. Or use `transformers.file_utils.set_seed`
-   if the old and new implementations are in the same framework.
+- Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should
+  probably take the time to write a longer script that decomposes the original model into smaller sub-components to
+  retrieve intermediate values. Is the original repository written in Tensorflow 1? Then you might have to rely on
+  TensorFlow print operations like [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) to output
+  intermediate values. Is the original repository written in Jax? Then make sure that the model is **not jitted** when
+  running the forward pass, *e.g.* check-out [this link](https://github.com/google/jax/issues/196).
+- Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debug cycle
+  becomes. It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds.
+  In case only very large checkpoints are available, it might make more sense to create a dummy model in the new
+  environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version
+  of your model
+- Make sure you are using the easiest way of calling a forward pass in the original repository. Ideally, you want to
+  find the function in the original repository that **only** calls a single forward pass, *i.e.* that is often called
+  `predict`, `evaluate`, `forward` or `__call__`. You don't want to debug a function that calls `forward`
+  multiple times, *e.g.* to generate text, like `autoregressive_sample`, `generate`.
+- Try to separate the tokenization from the model's *forward* pass. If the original repository shows examples where
+  you have to input a string, then try to find out where in the forward call the string input is changed to input ids
+  and start from this point. This might mean that you have to possibly write a small script yourself or change the
+  original code so that you can directly input the ids instead of an input string.
+- Make sure that the model in your debugging setup is **not** in training mode, which often causes the model to yield
+  random outputs due to multiple dropout layers in the model. Make sure that the forward pass in your debugging
+  environment is **deterministic** so that the dropout layers are not used. Or use *transformers.file_utils.set_seed*
+  if the old and new implementations are in the same framework.
 
 The following section gives you more specific details/tips on how you can do this for *brand_new_bert*.
 
-5.-14. Port BrandNewBert to 🤗 Transformers
------------------------------------------------------------------------------------------------------------------------
+### 5.-14. Port BrandNewBert to 🤗 Transformers
 
 Next, you can finally start adding new code to 🤗 Transformers. Go into the clone of your 🤗 Transformers' fork:
 
-::
-
-    cd transformers
+```bash
+cd transformers
+```
 
 In the special case that you are adding a model whose architecture exactly matches the model architecture of an
-existing model you only have to add a conversion script as described in `this section <#write-a-conversion-script>`__.
+existing model you only have to add a conversion script as described in [this section](#write-a-conversion-script).
 In this case, you can just re-use the whole model architecture of the already existing model.
 
 Otherwise, let's start generating a new model with the amazing Cookiecutter!
 
 **Use the Cookiecutter to automatically generate the model's code**
 
-To begin with head over to the `🤗 Transformers templates
-<https://github.com/huggingface/transformers/tree/master/templates/adding_a_new_model>`__ to make use of our
-``cookiecutter`` implementation to automatically generate all the relevant files for your model. Again, we recommend
-only adding the PyTorch version of the model at first. Make sure you follow the instructions of the ``README.md`` on
-the `🤗 Transformers templates <https://github.com/huggingface/transformers/tree/master/templates/adding_a_new_model>`__
+To begin with head over to the [🤗 Transformers templates](https://github.com/huggingface/transformers/tree/master/templates/adding_a_new_model) to make use of our
+`cookiecutter` implementation to automatically generate all the relevant files for your model. Again, we recommend
+only adding the PyTorch version of the model at first. Make sure you follow the instructions of the `README.md` on
+the [🤗 Transformers templates](https://github.com/huggingface/transformers/tree/master/templates/adding_a_new_model)
 carefully.
 
 **Open a Pull Request on the main huggingface/transformers repo**
@@ -415,29 +400,29 @@ You should do the following:
 
 1. Create a branch with a descriptive name from your master branch
 
-::
-
-    git checkout -b add_brand_new_bert
+```bash
+git checkout -b add_brand_new_bert
+```
 
 2. Commit the automatically generated code:
 
-::
-
-    git add .
-    git commit
+```bash
+git add .
+git commit
+```
 
 3. Fetch and rebase to current master
 
-::
-
-    git fetch upstream
-    git rebase upstream/master
+```bash
+git fetch upstream
+git rebase upstream/master
+```
 
 4. Push the changes to your account using:
 
-::
-
-    git push -u origin a-descriptive-name-for-my-changes
+```bash
+git push -u origin a-descriptive-name-for-my-changes
+```
 
 5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the
    GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for
@@ -449,10 +434,10 @@ In the following, whenever you have done some progress, don't forget to commit y
 that it shows in the pull request. Additionally, you should make sure to update your work with the current master from
 time to time by doing:
 
-::
-
-    git fetch upstream
-    git merge upstream/master
+```bash
+git fetch upstream
+git merge upstream/master
+```
 
 In general, all questions you might have regarding the model or your implementation should be asked in your PR and
 discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or
@@ -470,11 +455,11 @@ Hugging Face team by Slack or email.
 **5. Adapt the generated models code for brand_new_bert**
 
 At first, we will focus only on the model itself and not care about the tokenizer. All the relevant code should be
-found in the generated files ``src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`` and
-``src/transformers/models/brand_new_bert/configuration_brand_new_bert.py``.
+found in the generated files `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and
+`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`.
 
 Now you can finally start coding :). The generated code in
-``src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`` will either have the same architecture as BERT if
+`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` will either have the same architecture as BERT if
 it's an encoder-only model or BART if it's an encoder-decoder model. At this point, you should remind yourself what
 you've learned in the beginning about the theoretical aspects of the model: *How is the model different from BERT or
 BART?*". Implement those changes which often means to change the *self-attention* layer, the order of the normalization
@@ -483,19 +468,19 @@ get a better feeling of how your model should be implemented.
 
 **Note** that at this point, you don't have to be very sure that your code is fully correct or clean. Rather, it is
 advised to add a first *unclean*, copy-pasted version of the original code to
-``src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`` until you feel like all the necessary code is
+`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` until you feel like all the necessary code is
 added. From our experience, it is much more efficient to quickly add a first version of the required code and
 improve/correct the code iteratively with the conversion script as described in the next section. The only thing that
 has to work at this point is that you can instantiate the 🤗 Transformers implementation of *brand_new_bert*, *i.e.* the
 following command should work:
 
-.. code:: python
+```python
+from transformers import BrandNewBertModel, BrandNewBertConfig
+model = BrandNewBertModel(BrandNewBertConfig())
+```
 
-   from transformers import BrandNewBertModel, BrandNewBertConfig
-   model = BrandNewBertModel(BrandNewBertConfig())
-
-The above command will create a model according to the default parameters as defined in ``BrandNewBertConfig()`` with
-random weights, thus making sure that the ``init()`` methods of all components works.
+The above command will create a model according to the default parameters as defined in `BrandNewBertConfig()` with
+random weights, thus making sure that the `init()` methods of all components works.
 
 **6. Write a conversion script**
 
@@ -507,177 +492,173 @@ the same framework as *brand_new_bert*. Usually, it is enough to copy an already
 slightly adapt it for your use case. Don't hesitate to ask the Hugging Face team to point you to a similar already
 existing conversion script for your model.
 
--  If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script `here
-   <https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91>`__
--  If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script `here
-   <https://github.com/huggingface/transformers/blob/master/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py>`__
+- If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script [here](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)
+- If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script [here](https://github.com/huggingface/transformers/blob/master/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)
 
 In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. In PyTorch, the
 name of a layer is defined by the name of the class attribute you give the layer. Let's define a dummy model in
-PyTorch, called ``SimpleModel`` as follows:
+PyTorch, called `SimpleModel` as follows:
 
-.. code:: python
+```python
+from torch import nn
 
-   from torch import nn
+class SimpleModel(nn.Module):
+    def __init__(self):
+            super().__init__()
+            self.dense = nn.Linear(10, 10)
+            self.intermediate = nn.Linear(10, 10)
+            self.layer_norm = nn.LayerNorm(10)
+```
 
-   class SimpleModel(nn.Module):
-       def __init__(self):
-               super().__init__()
-               self.dense = nn.Linear(10, 10)
-               self.intermediate = nn.Linear(10, 10)
-               self.layer_norm = nn.LayerNorm(10)
+Now we can create an instance of this model definition which will fill all weights: `dense`, `intermediate`,
+`layer_norm` with random weights. We can print the model to see its architecture
 
-Now we can create an instance of this model definition which will fill all weights: ``dense``, ``intermediate``,
-``layer_norm`` with random weights. We can print the model to see its architecture
+```python
+model = SimpleModel()
 
-.. code:: python
-
-   model = SimpleModel()
-
-   print(model)
+print(model)
+```
 
 This will print out the following:
 
-.. code:: bash
-
-   SimpleModel(
-     (dense): Linear(in_features=10, out_features=10, bias=True)
-     (intermediate): Linear(in_features=10, out_features=10, bias=True)
-     (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
-   )
+```
+SimpleModel(
+  (dense): Linear(in_features=10, out_features=10, bias=True)
+  (intermediate): Linear(in_features=10, out_features=10, bias=True)
+  (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
+)
+```
 
 We can see that the layer names are defined by the name of the class attribute in PyTorch. You can print out the weight
 values of a specific layer:
 
-.. code:: python
-
-   print(model.dense.weight.data)
+```python
+print(model.dense.weight.data)
+```
 
 to see that the weights were randomly initialized
 
-.. code:: bash
-
-   tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
-            -0.2077,  0.2157],
-           [ 0.1044,  0.0201,  0.0990,  0.2482,  0.3116,  0.2509,  0.2866, -0.2190,
-             0.2166, -0.0212],
-           [-0.2000,  0.1107, -0.1999, -0.3119,  0.1559,  0.0993,  0.1776, -0.1950,
-            -0.1023, -0.0447],
-           [-0.0888, -0.1092,  0.2281,  0.0336,  0.1817, -0.0115,  0.2096,  0.1415,
-            -0.1876, -0.2467],
-           [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465,
-             0.2577,  0.0402],
-           [ 0.1502,  0.2465,  0.2566,  0.0693,  0.2352, -0.0530,  0.1859, -0.0604,
-             0.2132,  0.1680],
-           [ 0.1733, -0.2407, -0.1721,  0.1484,  0.0358, -0.0633, -0.0721, -0.0090,
-             0.2707, -0.2509],
-           [-0.1173,  0.1561,  0.2945,  0.0595, -0.1996,  0.2988, -0.0802,  0.0407,
-             0.1829, -0.1568],
-           [-0.1164, -0.2228, -0.0403,  0.0428,  0.1339,  0.0047,  0.1967,  0.2923,
-             0.0333, -0.0536],
-           [-0.1492, -0.1616,  0.1057,  0.1950, -0.2807, -0.2710, -0.1586,  0.0739,
-             0.2220,  0.2358]]).
+```
+tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
+         -0.2077,  0.2157],
+        [ 0.1044,  0.0201,  0.0990,  0.2482,  0.3116,  0.2509,  0.2866, -0.2190,
+          0.2166, -0.0212],
+        [-0.2000,  0.1107, -0.1999, -0.3119,  0.1559,  0.0993,  0.1776, -0.1950,
+         -0.1023, -0.0447],
+        [-0.0888, -0.1092,  0.2281,  0.0336,  0.1817, -0.0115,  0.2096,  0.1415,
+         -0.1876, -0.2467],
+        [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465,
+          0.2577,  0.0402],
+        [ 0.1502,  0.2465,  0.2566,  0.0693,  0.2352, -0.0530,  0.1859, -0.0604,
+          0.2132,  0.1680],
+        [ 0.1733, -0.2407, -0.1721,  0.1484,  0.0358, -0.0633, -0.0721, -0.0090,
+          0.2707, -0.2509],
+        [-0.1173,  0.1561,  0.2945,  0.0595, -0.1996,  0.2988, -0.0802,  0.0407,
+          0.1829, -0.1568],
+        [-0.1164, -0.2228, -0.0403,  0.0428,  0.1339,  0.0047,  0.1967,  0.2923,
+          0.0333, -0.0536],
+        [-0.1492, -0.1616,  0.1057,  0.1950, -0.2807, -0.2710, -0.1586,  0.0739,
+          0.2220,  0.2358]]).
+```
 
 In the conversion script, you should fill those randomly initialized weights with the exact weights of the
 corresponding layer in the checkpoint. *E.g.*
 
-.. code:: python
+```python
+# retrieve matching layer weights, e.g. by 
+# recursive algorithm
+layer_name = "dense"
+pretrained_weight = array_of_dense_layer
 
-   # retrieve matching layer weights, e.g. by 
-   # recursive algorithm
-   layer_name = "dense"
-   pretrained_weight = array_of_dense_layer
+model_pointer = getattr(model, "dense")
 
-   model_pointer = getattr(model, "dense")
-
-   model_pointer.weight.data = torch.from_numpy(pretrained_weight)
+model_pointer.weight.data = torch.from_numpy(pretrained_weight)
+```
 
 While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding
 pretrained checkpoint weight exactly match in both **shape and name**. To do so, it is **necessary** to add assert
 statements for the shape and print out the names of the checkpoints weights. E.g. you should add statements like:
 
-.. code:: python
-
-   assert (
-        model_pointer.weight.shape == pretrained_weight.shape
-   ), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
+```python
+assert (
+    model_pointer.weight.shape == pretrained_weight.shape
+), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
+```
 
 Besides, you should also print out the names of both weights to make sure they match, *e.g.*
 
-.. code:: python
-
-   logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
+```python
+logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
+```
 
 If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly
 initialized layer of the 🤗 Transformers implementation.
 
-An incorrect shape is most likely due to an incorrect setting of the config parameters in ``BrandNewBertConfig()`` that
+An incorrect shape is most likely due to an incorrect setting of the config parameters in `BrandNewBertConfig()` that
 do not exactly match those that were used for the checkpoint you want to convert. However, it could also be that
 PyTorch's implementation of a layer requires the weight to be transposed beforehand.
 
 Finally, you should also check that **all** required weights are initialized and print out all checkpoint weights that
 were not used for initialization to make sure the model is correctly converted. It is completely normal, that the
 conversion trials fail with either a wrong shape statement or wrong name assignment. This is most likely because either
-you used incorrect parameters in ``BrandNewBertConfig()``, have a wrong architecture in the 🤗 Transformers
-implementation, you have a bug in the ``init()`` functions of one of the components of the 🤗 Transformers
+you used incorrect parameters in `BrandNewBertConfig()`, have a wrong architecture in the 🤗 Transformers
+implementation, you have a bug in the `init()` functions of one of the components of the 🤗 Transformers
 implementation or you need to transpose one of the checkpoint weights.
 
 This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the
 Transformers model. Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save
-the model under a folder of your choice ``/path/to/converted/checkpoint/folder`` that should then contain both a
-``pytorch_model.bin`` file and a ``config.json`` file:
+the model under a folder of your choice `/path/to/converted/checkpoint/folder` that should then contain both a
+`pytorch_model.bin` file and a `config.json` file:
 
-.. code:: python
-
-   model.save_pretrained("/path/to/converted/checkpoint/folder")
+```python
+model.save_pretrained("/path/to/converted/checkpoint/folder")
+```
 
 **7. Implement the forward pass**
 
 Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make
-sure that the forward pass is correctly implemented. In `Get familiar with the original repository
-<#run-a-pretrained-checkpoint-using-the-original-repository>`__, you have already created a script that runs a forward
+sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward
 pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers
 implementation instead of the original one. It should look as follows:
 
-.. code:: python
-
-   model = BrandNewBertModel.from_pretrained(/path/to/converted/checkpoint/folder)
-   input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
-   output = model(input_ids).last_hidden_states
+```python
+model = BrandNewBertModel.from_pretrained(/path/to/converted/checkpoint/folder)
+input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
+output = model(input_ids).last_hidden_states
+```
 
 It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact
 same output the very first time or that the forward pass throws an error. Don't be disappointed - it's expected! First,
 you should make sure that the forward pass doesn't throw any errors. It often happens that the wrong dimensions are
-used leading to a `Dimensionality mismatch` error or that the wrong data type object is used, *e.g.* ``torch.long``
-instead of ``torch.float32``. Don't hesitate to ask the Hugging Face team for help, if you don't manage to solve
+used leading to a *Dimensionality mismatch* error or that the wrong data type object is used, *e.g.* `torch.long`
+instead of `torch.float32`. Don't hesitate to ask the Hugging Face team for help, if you don't manage to solve
 certain errors.
 
 The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are
-equivalent to a precision of ``1e-3``. First, you should ensure that the output shapes are identical, *i.e.*
-``outputs.shape`` should yield the same value for the script of the 🤗 Transformers implementation and the original
+equivalent to a precision of `1e-3`. First, you should ensure that the output shapes are identical, *i.e.*
+`outputs.shape` should yield the same value for the script of the 🤗 Transformers implementation and the original
 implementation. Next, you should make sure that the output values are identical as well. This one of the most difficult
 parts of adding a new model. Common mistakes why the outputs are not identical are:
 
--  Some layers were not added, *i.e.* an `activation` layer was not added, or the residual connection was forgotten
--  The word embedding matrix was not tied
--  The wrong positional embeddings are used because the original implementation uses on offset
--  Dropout is applied during the forward pass. To fix this make sure `model.training is False` and that no dropout
-   layer is falsely activated during the forward pass, *i.e.* pass `self.training` to `PyTorch's functional dropout
-   <https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout>`_
+- Some layers were not added, *i.e.* an *activation* layer was not added, or the residual connection was forgotten
+- The word embedding matrix was not tied
+- The wrong positional embeddings are used because the original implementation uses on offset
+- Dropout is applied during the forward pass. To fix this make sure *model.training is False* and that no dropout
+  layer is falsely activated during the forward pass, *i.e.* pass *self.training* to [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)
 
 The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗
 Transformers implementation side-by-side and check if there are any differences. Ideally, you should debug/print out
 intermediate outputs of both implementations of the forward pass to find the exact position in the network where the 🤗
 Transformers implementation shows a different output than the original implementation. First, make sure that the
-hard-coded ``input_ids`` in both scripts are identical. Next, verify that the outputs of the first transformation of
-the ``input_ids`` (usually the word embeddings) are identical. And then work your way up to the very last layer of the
+hard-coded `input_ids` in both scripts are identical. Next, verify that the outputs of the first transformation of
+the `input_ids` (usually the word embeddings) are identical. And then work your way up to the very last layer of the
 network. At some point, you will notice a difference between the two implementations, which should point you to the bug
 in the 🤗 Transformers implementation. From our experience, a simple and efficient way is to add many print statements
 in both the original implementation and 🤗 Transformers implementation, at the same positions in the network
 respectively, and to successively remove print statements showing the same values for intermediate presentations.
 
 When you're confident that both implementations yield the same output, verifying the outputs with
-``torch.allclose(original_output, output, atol=1e-3)``, you're done with the most difficult part! Congratulations - the
+`torch.allclose(original_output, output, atol=1e-3)`, you're done with the most difficult part! Congratulations - the
 work left to be done should be a cakewalk 😊.
 
 **8. Adding all necessary model tests**
@@ -685,42 +666,39 @@ work left to be done should be a cakewalk 😊.
 At this point, you have successfully added a new model. However, it is very much possible that the model does not yet
 fully comply with the required design. To make sure, the implementation is fully compatible with 🤗 Transformers, all
 common tests should pass. The Cookiecutter should have automatically added a test file for your model, probably under
-the same ``tests/test_modeling_brand_new_bert.py``. Run this test file to verify that all common tests pass:
+the same `tests/test_modeling_brand_new_bert.py`. Run this test file to verify that all common tests pass:
 
-.. code:: python
-
-   pytest tests/test_modeling_brand_new_bert.py
+```python
+pytest tests/test_modeling_brand_new_bert.py
+```
 
 Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so that
 
--  
-
-   a) The community can easily understand your work by looking at specific tests of *brand_new_bert*
-
--  
-
-   b) Future changes to your model will not break any important feature of the model.
+- a) The community can easily understand your work by looking at specific tests of *brand_new_bert*
+- b) Future changes to your model will not break any important feature of the model.
 
 At first, integration tests should be added. Those integration tests essentially do the same as the debugging scripts
 you used earlier to implement the model to 🤗 Transformers. A template of those model tests is already added by the
-Cookiecutter, called ``BrandNewBertModelIntegrationTests`` and only has to be filled out by you. To ensure that those
+Cookiecutter, called `BrandNewBertModelIntegrationTests` and only has to be filled out by you. To ensure that those
 tests are passing, run
 
-.. code:: python
+```bash
+RUN_SLOW=1 pytest -sv tests/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests
+```
 
-   RUN_SLOW=1 pytest -sv tests/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests
+<Tip>
 
-.. note::
+In case you are using Windows, you should replace `RUN_SLOW=1` with `SET RUN_SLOW=1`
 
-  In case you are using Windows, you should replace ``RUN_SLOW=1`` with ``SET RUN_SLOW=1``
+</Tip>
 
 Second, all features that are special to *brand_new_bert* should be tested additionally in a separate test under
-``BrandNewBertModelTester``/``BrandNewBertModelTest``. This part is often forgotten but is extremely useful in two
+`BrandNewBertModelTester`/``BrandNewBertModelTest`. This part is often forgotten but is extremely useful in two
 ways:
 
--  It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the
-   special features of *brand_new_bert* should work.
--  Future contributors can quickly test changes to the model by running those special tests.
+- It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the
+  special features of *brand_new_bert* should work.
+- Future contributors can quickly test changes to the model by running those special tests.
 
 
 **9. Implement the tokenizer**
@@ -732,29 +710,29 @@ It is very important to find/extract the original tokenizer file and to manage t
 Transformers' implementation of the tokenizer.
 
 To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository
-that inputs a string and returns the ``input_ids``. It could look similar to this (in pseudo-code):
+that inputs a string and returns the `input_ids``. It could look similar to this (in pseudo-code):
 
-.. code:: bash
-
-   input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
-   model = BrandNewBertModel.load_pretrained_checkpoint(/path/to/checkpoint/)
-   input_ids = model.tokenize(input_str)
+```python
+input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
+model = BrandNewBertModel.load_pretrained_checkpoint(/path/to/checkpoint/)
+input_ids = model.tokenize(input_str)
+```
 
 You might have to take a deeper look again into the original repository to find the correct tokenizer function or you
-might even have to do changes to your clone of the original repository to only output the ``input_ids``. Having written
+might even have to do changes to your clone of the original repository to only output the `input_ids`. Having written
 a functional tokenization script that uses the original repository, an analogous script for 🤗 Transformers should be
 created. It should look similar to this:
 
-.. code:: python
+```python
+from transformers import BrandNewBertTokenizer
+input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
 
-   from transformers import BrandNewBertTokenizer
-   input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
+tokenizer = BrandNewBertTokenizer.from_pretrained(/path/to/tokenizer/folder/)
 
-   tokenizer = BrandNewBertTokenizer.from_pretrained(/path/to/tokenizer/folder/)
+input_ids = tokenizer(input_str).input_ids
+```
 
-   input_ids = tokenizer(input_str).input_ids
-
-When both ``input_ids`` yield the same values, as a final step a tokenizer test file should also be added.
+When both `input_ids` yield the same values, as a final step a tokenizer test file should also be added.
 
 Analogous to the modeling test files of *brand_new_bert*, the tokenization test files of *brand_new_bert* should
 contain a couple of hard-coded integration tests.
@@ -762,12 +740,12 @@ contain a couple of hard-coded integration tests.
 **10. Run End-to-end integration tests**
 
 Having added the tokenizer, you should also add a couple of end-to-end integration tests using both the model and the
-tokenizer to ``tests/test_modeling_brand_new_bert.py`` in 🤗 Transformers. Such a test should show on a meaningful
+tokenizer to `tests/test_modeling_brand_new_bert.py` in 🤗 Transformers. Such a test should show on a meaningful
 text-to-text sample that the 🤗 Transformers implementation works as expected. A meaningful text-to-text sample can
 include *e.g.* a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc… If none
 of the ported checkpoints has been fine-tuned on a downstream task it is enough to simply rely on the model tests. In a
 final step to ensure that the model is fully functional, it is advised that you also run all tests on GPU. It can
-happen that you forgot to add some ``.to(self.device)`` statements to internal tensors of the model, which in such a
+happen that you forgot to add some `.to(self.device)` statements to internal tensors of the model, which in such a
 test would show in an error. In case you have no access to a GPU, the Hugging Face team can take care of running those
 tests for you.
 
@@ -775,12 +753,12 @@ tests for you.
 
 Now, all the necessary functionality for *brand_new_bert* is added - you're almost done! The only thing left to add is
 a nice docstring and a doc page. The Cookiecutter should have added a template file called
-``docs/source/model_doc/brand_new_bert.rst`` that you should fill out. Users of your model will usually first look at
+`docs/source/model_doc/brand_new_bert.rst` that you should fill out. Users of your model will usually first look at
 this page before using your model. Hence, the documentation must be understandable and concise. It is very useful for
 the community to add some *Tips* to show how the model should be used. Don't hesitate to ping the Hugging Face team
 regarding the docstrings.
 
-Next, make sure that the docstring added to ``src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`` is
+Next, make sure that the docstring added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` is
 correct and included all necessary inputs and outputs. It is always to good to remind oneself that documentation should
 be treated at least as carefully as the code in 🤗 Transformers since the documentation is usually the first contact
 point of the community with the model.
@@ -790,15 +768,15 @@ point of the community with the model.
 Great, now you have added all the necessary code for *brand_new_bert*. At this point, you should correct some potential
 incorrect code style by running:
 
-.. code:: bash
-
-   make style
+```bash
+make style
+```
 
 and verify that your coding style passes the quality check:
 
-.. code:: bash
-
-   make quality
+```bash
+make quality
+```
 
 There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which shows up in
 the tests of your pull request. This is often because of some missing information in the docstring or some incorrect
@@ -833,8 +811,7 @@ Hugging Face team should have helped you already at this point, but it is worth
 PR a nice description and eventually add comments to your code, if you want to point out certain design choices to your
 reviewer.
 
-Share your work!!
------------------------------------------------------------------------------------------------------------------------
+### Share your work!!
 
 Now, it's time to get some credit from the community for your work! Having completed a model addition is a major
 contribution to Transformers and the whole NLP community. Your code and the ported pre-trained models will certainly be
diff --git a/docs/source/add_new_pipeline.mdx b/docs/source/add_new_pipeline.mdx
new file mode 100644
index 0000000000..661e92a449
--- /dev/null
+++ b/docs/source/add_new_pipeline.mdx
@@ -0,0 +1,138 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+-->
+
+# How to add a pipeline to 🤗 Transformers?
+
+First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes,
+dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible
+as it makes compatibility easier (even through other languages via JSON). Those will be the `inputs` of the
+pipeline (`preprocess`).
+
+Then define the `outputs`. Same policy as the `inputs`. The simpler, the better. Those will be the outputs of
+`postprocess` method.
+
+Start by inheriting the base class `Pipeline`. with the 4 methods needed to implement `preprocess`,
+`_forward`, `postprocess` and `_sanitize_parameters`.
+
+
+```python
+from transformers import Pipeline
+
+class MyPipeline(Pipeline):
+    def _sanitize_parameters(self, **kwargs):
+        preprocess_kwargs = {}
+        if "maybe_arg" in kwargs:
+            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
+        return preprocess_kwargs, {}, {}
+
+    def preprocess(self, inputs, maybe_arg=2):
+        model_input = Tensor(....)
+        return {"model_input": model_input}
+
+    def _forward(self, model_inputs):
+        # model_inputs == {"model_input": model_input}
+        outputs = self.model(**model_inputs)
+        # Maybe {"logits": Tensor(...)}
+        return outputs
+
+    def postprocess(self, model_outputs):
+        best_class = model_outputs["logits"].softmax(-1)
+        return best_class
+```
+
+The structure of this breakdown is to support relatively seamless support for CPU/GPU, while supporting doing
+pre/postprocessing on the CPU on different threads
+
+`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might
+contain more information and is usually a `Dict`.
+
+`_forward` is the implementation detail and is not meant to be called directly. `forward` is the preferred
+called method as it contains safeguards to make sure everything is working on the expected device. If anything is
+linked to a real model it belongs in the `_forward` method, anything else is in the preprocess/postprocess.
+
+`postprocess` methods will take the output of `_forward` and turn it into the final output that were decided
+earlier.
+
+`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization
+time `pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`.
+
+The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`,
+`_forward` and `postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That
+allows to keep the default arguments in the function definition which is always more "natural".
+
+A classic example would be a `top_k` argument in the post processing in classification tasks.
+
+```python
+>>> pipe = pipeline("my-new-task")
+>>> pipe("This is a test")
+[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05}
+{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]
+
+>>> pipe("This is a test", top_k=2)
+[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
+```
+
+In order to achieve that, we'll update our `postprocess` method with a default parameter to `5`. and edit
+`_sanitize_parameters` to allow this new parameter.
+
+
+```python
+def postprocess(self, model_outputs, top_k=5):
+    best_class = model_outputs["logits"].softmax(-1)
+    # Add logic to handle top_k
+    return best_class
+
+def _sanitize_parameters(self, **kwargs):
+    preprocess_kwargs = {}
+    if "maybe_arg" in kwargs:
+        preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
+
+    postprocess_kwargs = {}
+    if "top_k" in kwargs:
+        preprocess_kwargs["top_k"] = kwargs["top_k"]
+    return preprocess_kwargs, {}, postprocess_kwargs
+```
+
+Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy
+without requiring users to understand new kind of objects. It's also relatively common to support many different types
+of arguments for ease of use (audio files, can be filenames, URLs or pure bytes)
+
+
+
+## Adding it to the list of supported tasks
+
+Go to `src/transformers/pipelines/__init__.py` and fill in `SUPPORTED_TASKS` with your newly created pipeline.
+If possible it should provide a default model.
+
+## Adding tests
+
+Create a new file `tests/test_pipelines_MY_PIPELINE.py` with example with the other tests.
+
+The `run_pipeline_test` function will be very generic and run on small random models on every possible
+architecture as defined by `model_mapping` and `tf_model_mapping`.
+
+This is very important to test future compatibility, meaning if someone adds a new model for
+`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's
+impossible to check for actual values, that's why There is a helper `ANY` that will simply attempt to match the
+output of the pipeline TYPE.
+
+You also *need* to implement 2 (ideally 4) tests.
+
+- `test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
+  and test the pipeline outputs. The results should be the same as `test_small_model_tf`.
+- `test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
+  and test the pipeline outputs. The results should be the same as `test_small_model_pt`.
+- `test_large_model_pt` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to
+  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
+  sure there is no drift in future releases
+- `test_large_model_tf` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to
+  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
+  sure there is no drift in future releases
diff --git a/docs/source/add_new_pipeline.rst b/docs/source/add_new_pipeline.rst
deleted file mode 100644
index ccf587968f..0000000000
--- a/docs/source/add_new_pipeline.rst
+++ /dev/null
@@ -1,143 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-
-How to add a pipeline to 🤗 Transformers?
-=======================================================================================================================
-
-First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes,
-dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible
-as it makes compatibility easier (even through other languages via JSON). Those will be the :obj:`inputs` of the
-pipeline (:obj:`preprocess`).
-
-Then define the :obj:`outputs`. Same policy as the :obj:`inputs`. The simpler, the better. Those will be the outputs of
-:obj:`postprocess` method.
-
-Start by inheriting the base class :obj:`Pipeline`. with the 4 methods needed to implement :obj:`preprocess`,
-:obj:`_forward`, :obj:`postprocess` and :obj:`_sanitize_parameters`.
-
-
-.. code-block::
-
-    from transformers import Pipeline
-
-    class MyPipeline(Pipeline):
-        def _sanitize_parameters(self, **kwargs):
-            preprocess_kwargs = {}
-            if "maybe_arg" in kwargs:
-                preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
-            return preprocess_kwargs, {}, {}
-
-        def preprocess(self, inputs, maybe_arg=2):
-            model_input = Tensor(....)
-            return {"model_input": model_input}
-
-        def _forward(self, model_inputs):
-            # model_inputs == {"model_input": model_input}
-            outputs = self.model(**model_inputs)
-            # Maybe {"logits": Tensor(...)}
-            return outputs
-
-        def postprocess(self, model_outputs):
-            best_class = model_outputs["logits"].softmax(-1)
-            return best_class
-
-
-The structure of this breakdown is to support relatively seamless support for CPU/GPU, while supporting doing
-pre/postprocessing on the CPU on different threads
-
-:obj:`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might
-contain more information and is usually a :obj:`Dict`.
-
-:obj:`_forward` is the implementation detail and is not meant to be called directly. :obj:`forward` is the preferred
-called method as it contains safeguards to make sure everything is working on the expected device. If anything is
-linked to a real model it belongs in the :obj:`_forward` method, anything else is in the preprocess/postprocess.
-
-:obj:`postprocess` methods will take the output of :obj:`_forward` and turn it into the final output that were decided
-earlier.
-
-:obj:`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization
-time ``pipeline(...., maybe_arg=4)`` or at call time ``pipe = pipeline(...); output = pipe(...., maybe_arg=4)``.
-
-The returns of :obj:`_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to :obj:`preprocess`,
-:obj:`_forward` and :obj:`postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That
-allows to keep the default arguments in the function definition which is always more "natural".
-
-A classic example would be a :obj:`top_k` argument in the post processing in classification tasks.
-
-.. code-block::
-
-    >>> pipe = pipeline("my-new-task")
-    >>> pipe("This is a test")
-    [{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05}
-    {"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]
-
-    >>> pipe("This is a test", top_k=2)
-    [{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
-
-In order to achieve that, we'll update our :obj:`postprocess` method with a default parameter to :obj:`5`. and edit
-:obj:`_sanitize_parameters` to allow this new parameter.
-
-
-.. code-block::
-
-
-        def postprocess(self, model_outputs, top_k=5):
-            best_class = model_outputs["logits"].softmax(-1)
-            # Add logic to handle top_k
-            return best_class
-
-        def _sanitize_parameters(self, **kwargs):
-            preprocess_kwargs = {}
-            if "maybe_arg" in kwargs:
-                preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
-
-            postprocess_kwargs = {}
-            if "top_k" in kwargs:
-                preprocess_kwargs["top_k"] = kwargs["top_k"]
-            return preprocess_kwargs, {}, postprocess_kwargs
-
-Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy
-without requiring users to understand new kind of objects. It's also relatively common to support many different types
-of arguments for ease of use (audio files, can be filenames, URLs or pure bytes)
-
-
-
-Adding it to the list of supported tasks
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Go to ``src/transformers/pipelines/__init__.py`` and fill in :obj:`SUPPORTED_TASKS` with your newly created pipeline.
-If possible it should provide a default model.
-
-Adding tests
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Create a new file ``tests/test_pipelines_MY_PIPELINE.py`` with example with the other tests.
-
-The :obj:`run_pipeline_test` function will be very generic and run on small random models on every possible
-architecture as defined by :obj:`model_mapping` and :obj:`tf_model_mapping`.
-
-This is very important to test future compatibility, meaning if someone adds a new model for
-:obj:`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's
-impossible to check for actual values, that's why There is a helper :obj:`ANY` that will simply attempt to match the
-output of the pipeline TYPE.
-
-You also *need* to implement 2 (ideally 4) tests.
-
-- :obj:`test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
-  and test the pipeline outputs. The results should be the same as :obj:`test_small_model_tf`.
-- :obj:`test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
-  and test the pipeline outputs. The results should be the same as :obj:`test_small_model_pt`.
-- :obj:`test_large_model_pt` (:obj:`optional`): Tests the pipeline on a real pipeline where the results are supposed to
-  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
-  sure there is no drift in future releases
-- :obj:`test_large_model_tf` (:obj:`optional`): Tests the pipeline on a real pipeline where the results are supposed to
-  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
-  sure there is no drift in future releases
diff --git a/docs/source/bertology.mdx b/docs/source/bertology.mdx
new file mode 100644
index 0000000000..b0dd264da8
--- /dev/null
+++ b/docs/source/bertology.mdx
@@ -0,0 +1,36 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# BERTology
+
+There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT
+(that some call "BERTology"). Some good examples of this field are:
+
+
+- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick:
+  https://arxiv.org/abs/1905.05950
+- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
+- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D.
+  Manning: https://arxiv.org/abs/1906.04341
+
+In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to
+help people access the inner representations, mainly adapted from the great work of Paul Michel
+(https://arxiv.org/abs/1905.10650):
+
+
+- accessing all the hidden-states of BERT/GPT/GPT-2,
+- accessing all the attention weights for each head of BERT/GPT/GPT-2,
+- retrieving heads output values and gradients to be able to compute head importance score and prune head as explained
+  in https://arxiv.org/abs/1905.10650.
+
+To help you understand and use these features, we have added a specific example script: [bertology.py](https://github.com/huggingface/transformers/tree/master/examples/research_projects/bertology/run_bertology.py) while extract information and prune a model pre-trained on
+GLUE.
diff --git a/docs/source/bertology.rst b/docs/source/bertology.rst
deleted file mode 100644
index 79fa34abfc..0000000000
--- a/docs/source/bertology.rst
+++ /dev/null
@@ -1,38 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-BERTology
------------------------------------------------------------------------------------------------------------------------
-
-There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT
-(that some call "BERTology"). Some good examples of this field are:
-
-
-* BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick:
-  https://arxiv.org/abs/1905.05950
-* Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
-* What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D.
-  Manning: https://arxiv.org/abs/1906.04341
-
-In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to
-help people access the inner representations, mainly adapted from the great work of Paul Michel
-(https://arxiv.org/abs/1905.10650):
-
-
-* accessing all the hidden-states of BERT/GPT/GPT-2,
-* accessing all the attention weights for each head of BERT/GPT/GPT-2,
-* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained
-  in https://arxiv.org/abs/1905.10650.
-
-To help you understand and use these features, we have added a specific example script: :prefix_link:`bertology.py
-<examples/research_projects/bertology/run_bertology.py>` while extract information and prune a model pre-trained on
-GLUE.
diff --git a/docs/source/community.md b/docs/source/community.mdx
similarity index 100%
rename from docs/source/community.md
rename to docs/source/community.mdx
diff --git a/docs/source/converting_tensorflow_models.mdx b/docs/source/converting_tensorflow_models.mdx
new file mode 100644
index 0000000000..de83cd02bd
--- /dev/null
+++ b/docs/source/converting_tensorflow_models.mdx
@@ -0,0 +1,162 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Converting Tensorflow Checkpoints
+
+A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints to models
+that can be loaded using the `from_pretrained` methods of the library.
+
+<Tip>
+
+Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**) available in any
+transformers >= 2.3.0 installation.
+
+The documentation below reflects the **transformers-cli convert** command format.
+
+</Tip>
+
+## BERT
+
+You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the
+[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/master/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py) script.
+
+This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated
+configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from
+the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can
+be imported using `from_pretrained()` (see example in [quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification/run_glue.py) ).
+
+You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
+checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file (\
+`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too.
+
+To run this specific conversion script you will need to have TensorFlow and PyTorch installed (`pip install tensorflow`). The rest of the repository only requires PyTorch.
+
+Here is an example of the conversion process for a pre-trained `BERT-Base Uncased` model:
+
+```bash
+export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
+
+transformers-cli convert --model_type bert \
+  --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
+  --config $BERT_BASE_DIR/bert_config.json \
+  --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
+```
+
+You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/bert#pre-trained-models).
+
+## ALBERT
+
+Convert TensorFlow model checkpoints of ALBERT to PyTorch using the
+[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/master/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py) script.
+
+The CLI takes as input a TensorFlow checkpoint (three files starting with `model.ckpt-best`) and the accompanying
+configuration file (`albert_config.json`), then creates and saves a PyTorch model. To run this conversion you will
+need to have TensorFlow and PyTorch installed.
+
+Here is an example of the conversion process for the pre-trained `ALBERT Base` model:
+
+```bash
+export ALBERT_BASE_DIR=/path/to/albert/albert_base
+
+transformers-cli convert --model_type albert \
+  --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
+  --config $ALBERT_BASE_DIR/albert_config.json \
+  --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
+```
+
+You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/albert#pre-trained-models).
+
+## OpenAI GPT
+
+Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint
+save as the same format than OpenAI pretrained model (see [here](https://github.com/openai/finetune-transformer-lm)\
+)
+
+```bash
+export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
+
+transformers-cli convert --model_type gpt \
+  --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--config OPENAI_GPT_CONFIG] \
+  [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
+```
+
+## OpenAI GPT-2
+
+Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see [here](https://github.com/openai/gpt-2))
+
+```bash
+export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
+
+transformers-cli convert --model_type gpt2 \
+  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--config OPENAI_GPT2_CONFIG] \
+  [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
+```
+
+## Transformer-XL
+
+Here is an example of the conversion process for a pre-trained Transformer-XL model (see [here](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models))
+
+```bash
+export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
+
+transformers-cli convert --model_type transfo_xl \
+  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--config TRANSFO_XL_CONFIG] \
+  [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
+```
+
+## XLNet
+
+Here is an example of the conversion process for a pre-trained XLNet model:
+
+```bash
+export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
+export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
+
+transformers-cli convert --model_type xlnet \
+  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
+  --config $TRANSFO_XL_CONFIG_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--finetuning_task_name XLNET_FINETUNED_TASK] \
+```
+
+## XLM
+
+Here is an example of the conversion process for a pre-trained XLM model:
+
+```bash
+export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
+
+transformers-cli convert --model_type xlm \
+  --tf_checkpoint $XLM_CHECKPOINT_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
+ [--config XML_CONFIG] \
+ [--finetuning_task_name XML_FINETUNED_TASK]
+```
+
+## T5
+
+Here is an example of the conversion process for a pre-trained T5 model:
+
+```bash
+export T5=/path/to/t5/uncased_L-12_H-768_A-12
+
+transformers-cli convert --model_type t5 \
+  --tf_checkpoint $T5/t5_model.ckpt \
+  --config $T5/t5_config.json \
+  --pytorch_dump_output $T5/pytorch_model.bin
+```
diff --git a/docs/source/converting_tensorflow_models.rst b/docs/source/converting_tensorflow_models.rst
deleted file mode 100644
index 85e9591503..0000000000
--- a/docs/source/converting_tensorflow_models.rst
+++ /dev/null
@@ -1,181 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Converting Tensorflow Checkpoints
-=======================================================================================================================
-
-A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints to models
-that can be loaded using the ``from_pretrained`` methods of the library.
-
-.. note::
-    Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**) available in any
-    transformers >= 2.3.0 installation.
-
-    The documentation below reflects the **transformers-cli convert** command format.
-
-BERT
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google
-<https://github.com/google-research/bert#pre-trained-models>`_) in a PyTorch save file by using the
-:prefix_link:`convert_bert_original_tf_checkpoint_to_pytorch.py
-<src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py>` script.
-
-This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``) and the associated
-configuration file (``bert_config.json``), and creates a PyTorch model for this configuration, loads the weights from
-the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can
-be imported using ``from_pretrained()`` (see example in :doc:`quicktour` , :prefix_link:`run_glue.py
-<examples/pytorch/text-classification/run_glue.py>` ).
-
-You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
-checkpoint (the three files starting with ``bert_model.ckpt``) but be sure to keep the configuration file (\
-``bert_config.json``) and the vocabulary file (``vocab.txt``) as these are needed for the PyTorch model too.
-
-To run this specific conversion script you will need to have TensorFlow and PyTorch installed (``pip install
-tensorflow``). The rest of the repository only requires PyTorch.
-
-Here is an example of the conversion process for a pre-trained ``BERT-Base Uncased`` model:
-
-.. code-block:: shell
-
-    export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
-
-    transformers-cli convert --model_type bert \
-      --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
-      --config $BERT_BASE_DIR/bert_config.json \
-      --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
-
-You can download Google's pre-trained models for the conversion `here
-<https://github.com/google-research/bert#pre-trained-models>`__.
-
-ALBERT
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Convert TensorFlow model checkpoints of ALBERT to PyTorch using the
-:prefix_link:`convert_albert_original_tf_checkpoint_to_pytorch.py
-<src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py>` script.
-
-The CLI takes as input a TensorFlow checkpoint (three files starting with ``model.ckpt-best``) and the accompanying
-configuration file (``albert_config.json``), then creates and saves a PyTorch model. To run this conversion you will
-need to have TensorFlow and PyTorch installed.
-
-Here is an example of the conversion process for the pre-trained ``ALBERT Base`` model:
-
-.. code-block:: shell
-
-    export ALBERT_BASE_DIR=/path/to/albert/albert_base
-
-    transformers-cli convert --model_type albert \
-      --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
-      --config $ALBERT_BASE_DIR/albert_config.json \
-      --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
-
-You can download Google's pre-trained models for the conversion `here
-<https://github.com/google-research/albert#pre-trained-models>`__.
-
-OpenAI GPT
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint
-save as the same format than OpenAI pretrained model (see `here <https://github.com/openai/finetune-transformer-lm>`__\
-)
-
-.. code-block:: shell
-
-    export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
-
-    transformers-cli convert --model_type gpt \
-      --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
-      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-      [--config OPENAI_GPT_CONFIG] \
-      [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
-
-
-OpenAI GPT-2
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here
-<https://github.com/openai/gpt-2>`__)
-
-.. code-block:: shell
-
-    export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
-
-    transformers-cli convert --model_type gpt2 \
-      --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
-      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-      [--config OPENAI_GPT2_CONFIG] \
-      [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
-
-Transformer-XL
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here
-<https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models>`__)
-
-.. code-block:: shell
-
-    export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
-
-    transformers-cli convert --model_type transfo_xl \
-      --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
-      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-      [--config TRANSFO_XL_CONFIG] \
-      [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
-
-
-XLNet
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Here is an example of the conversion process for a pre-trained XLNet model:
-
-.. code-block:: shell
-
-    export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
-    export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
-
-    transformers-cli convert --model_type xlnet \
-      --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
-      --config $TRANSFO_XL_CONFIG_PATH \
-      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-      [--finetuning_task_name XLNET_FINETUNED_TASK] \
-
-
-XLM
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Here is an example of the conversion process for a pre-trained XLM model:
-
-.. code-block:: shell
-
-    export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
-
-    transformers-cli convert --model_type xlm \
-      --tf_checkpoint $XLM_CHECKPOINT_PATH \
-      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
-     [--config XML_CONFIG] \
-     [--finetuning_task_name XML_FINETUNED_TASK]
-
-
-T5
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Here is an example of the conversion process for a pre-trained T5 model:
-
-.. code-block:: shell
-
-    export T5=/path/to/t5/uncased_L-12_H-768_A-12
-
-    transformers-cli convert --model_type t5 \
-      --tf_checkpoint $T5/t5_model.ckpt \
-      --config $T5/t5_config.json \
-      --pytorch_dump_output $T5/pytorch_model.bin
diff --git a/docs/source/fast_tokenizers.mdx b/docs/source/fast_tokenizers.mdx
new file mode 100644
index 0000000000..22b887b353
--- /dev/null
+++ b/docs/source/fast_tokenizers.mdx
@@ -0,0 +1,70 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Using tokenizers from 🤗 Tokenizers
+
+The [`PreTrainedTokenizerFast`] depends on the [🤗 Tokenizers](https://huggingface.co/docs/tokenizers) library. The tokenizers obtained from the 🤗 Tokenizers library can be
+loaded very simply into 🤗 Transformers.
+
+Before getting in the specifics, let's first start by creating a dummy tokenizer in a few lines:
+
+```python
+>>> from tokenizers import Tokenizer
+>>> from tokenizers.models import BPE
+>>> from tokenizers.trainers import BpeTrainer
+>>> from tokenizers.pre_tokenizers import Whitespace
+
+>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+
+>>> tokenizer.pre_tokenizer = Whitespace()
+>>> files = [...]
+>>> tokenizer.train(files, trainer)
+```
+
+We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it to
+a JSON file for future re-use.
+
+## Loading directly from the tokenizer object
+
+Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The
+[`PreTrainedTokenizerFast`] class allows for easy instantiation, by accepting the instantiated
+*tokenizer* object as an argument:
+
+```python
+>>> from transformers import PreTrainedTokenizerFast
+
+>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
+```
+
+This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer
+page](main_classes/tokenizer) for more information.
+
+## Loading from a JSON file
+
+In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer:
+
+```python
+>>> tokenizer.save("tokenizer.json")
+```
+
+The path to which we saved this file can be passed to the [`PreTrainedTokenizerFast`] initialization
+method using the `tokenizer_file` parameter:
+
+```python
+>>> from transformers import PreTrainedTokenizerFast
+
+>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
+```
+
+This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer
+page](main_classes/tokenizer) for more information.
diff --git a/docs/source/fast_tokenizers.rst b/docs/source/fast_tokenizers.rst
deleted file mode 100644
index 52584b7eb4..0000000000
--- a/docs/source/fast_tokenizers.rst
+++ /dev/null
@@ -1,62 +0,0 @@
-Using tokenizers from 🤗 Tokenizers
-=======================================================================================================================
-
-The :class:`~transformers.PreTrainedTokenizerFast` depends on the `tokenizers
-<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 Tokenizers library can be
-loaded very simply into 🤗 Transformers.
-
-Before getting in the specifics, let's first start by creating a dummy tokenizer in a few lines:
-
-.. code-block::
-
-    >>> from tokenizers import Tokenizer
-    >>> from tokenizers.models import BPE
-    >>> from tokenizers.trainers import BpeTrainer
-    >>> from tokenizers.pre_tokenizers import Whitespace
-
-    >>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
-    >>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
-
-    >>> tokenizer.pre_tokenizer = Whitespace()
-    >>> files = [...]
-    >>> tokenizer.train(files, trainer)
-
-We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it to
-a JSON file for future re-use.
-
-Loading directly from the tokenizer object
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The
-:class:`~transformers.PreTrainedTokenizerFast` class allows for easy instantiation, by accepting the instantiated
-`tokenizer` object as an argument:
-
-.. code-block::
-
-    >>> from transformers import PreTrainedTokenizerFast
-
-    >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
-
-This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer
-page <main_classes/tokenizer>` for more information.
-
-Loading from a JSON file
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer:
-
-.. code-block::
-
-    >>> tokenizer.save("tokenizer.json")
-
-The path to which we saved this file can be passed to the :class:`~transformers.PreTrainedTokenizerFast` initialization
-method using the :obj:`tokenizer_file` parameter:
-
-.. code-block::
-
-    >>> from transformers import PreTrainedTokenizerFast
-
-    >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
-
-This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer
-page <main_classes/tokenizer>` for more information.
diff --git a/docs/source/glossary.mdx b/docs/source/glossary.mdx
new file mode 100644
index 0000000000..2685e2082d
--- /dev/null
+++ b/docs/source/glossary.mdx
@@ -0,0 +1,297 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Glossary
+
+## General terms
+
+- autoencoding models: see MLM
+- autoregressive models: see CLM
+- CLM: causal language modeling, a pretraining task where the model reads the texts in order and has to predict the
+  next word. It's usually done by reading the whole sentence but using a mask inside the model to hide the future
+  tokens at a certain timestep.
+- deep learning: machine learning algorithms which uses neural networks with several layers.
+- MLM: masked language modeling, a pretraining task where the model sees a corrupted version of the texts, usually done
+  by masking some tokens randomly, and has to predict the original text.
+- multimodal: a task that combines texts with another kind of inputs (for instance images).
+- NLG: natural language generation, all tasks related to generating text (for instance talk with transformers,
+  translation).
+- NLP: natural language processing, a generic way to say "deal with texts".
+- NLU: natural language understanding, all tasks related to understanding what is in a text (for instance classifying
+  the whole text, individual words).
+- pretrained model: a model that has been pretrained on some data (for instance all of Wikipedia). Pretraining methods
+  involve a self-supervised objective, which can be reading the text and trying to predict the next word (see CLM) or
+  masking some words and trying to predict them (see MLM).
+- RNN: recurrent neural network, a type of model that uses a loop over a layer to process texts.
+- self-attention: each element of the input finds out which other elements of the input they should attend to.
+- seq2seq or sequence-to-sequence: models that generate a new sequence from an input, like translation models, or
+  summarization models (such as [Bart](model_doc/bart) or [T5](model_doc/t5)).
+- token: a part of a sentence, usually a word, but can also be a subword (non-common words are often split in subwords)
+  or a punctuation symbol.
+- transformer: self-attention based deep learning model architecture.
+
+## Model inputs
+
+Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
+detailed here alongside usage examples.
+
+<a id='input-ids'></a>
+
+### Input IDs
+
+The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
+numerical representations of tokens building the sequences that will be used as input by the model*.
+
+<Youtube id="VFp38yj8h3A"/>
+
+Each tokenizer works differently but the underlying mechanism remains the same. Here's an example using the BERT
+tokenizer, which is a [WordPiece](https://arxiv.org/pdf/1609.08144.pdf) tokenizer:
+
+```python
+>>> from transformers import BertTokenizer
+>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+
+>>> sequence = "A Titan RTX has 24GB of VRAM"
+```
+
+The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.
+
+```python
+>>> tokenized_sequence = tokenizer.tokenize(sequence)
+```
+
+The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
+in "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash prefix
+is added for "RA" and "M":
+
+```python
+>>> print(tokenized_sequence)
+['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
+```
+
+These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
+the sentence to the tokenizer, which leverages the Rust implementation of [🤗 Tokenizers](https://github.com/huggingface/tokenizers) for peak performance.
+
+```python
+>>> inputs = tokenizer(sequence)
+```
+
+The tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly. The
+token indices are under the key "input_ids":
+
+```python
+>>> encoded_sequence = inputs["input_ids"]
+>>> print(encoded_sequence)
+[101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
+```
+
+Note that the tokenizer automatically adds "special tokens" (if the associated model relies on them) which are special
+IDs the model sometimes uses.
+
+If we decode the previous sequence of ids,
+
+```python
+>>> decoded_sequence = tokenizer.decode(encoded_sequence)
+```
+
+we will see
+
+```python
+>>> print(decoded_sequence)
+[CLS] A Titan RTX has 24GB of VRAM [SEP]
+```
+
+because this is the way a [`BertModel`] is going to expect its inputs.
+
+<a id='attention-mask'></a>
+
+### Attention mask
+
+The attention mask is an optional argument used when batching sequences together.
+
+<Youtube id="M6adb1j2jPI"/>
+
+This argument indicates to the model which tokens should be attended to, and which should not.
+
+For example, consider these two sequences:
+
+```python
+>>> from transformers import BertTokenizer
+>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+
+>>> sequence_a = "This is a short sequence."
+>>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
+
+>>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
+>>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
+```
+
+The encoded versions have different lengths:
+
+```python
+>>> len(encoded_sequence_a), len(encoded_sequence_b)
+(8, 19)
+```
+
+Therefore, we can't put them together in the same tensor as-is. The first sequence needs to be padded up to the length
+of the second one, or the second one needs to be truncated down to the length of the first one.
+
+In the first case, the list of IDs will be extended by the padding indices. We can pass a list to the tokenizer and ask
+it to pad like this:
+
+```python
+>>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
+```
+
+We can see that 0s have been added on the right of the first sentence to make it the same length as the second one:
+
+```python
+>>> padded_sequences["input_ids"]
+[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
+```
+
+This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating the
+position of the padded indices so that the model does not attend to them. For the [`BertTokenizer`],
+`1` indicates a value that should be attended to, while `0` indicates a padded value. This attention mask is
+in the dictionary returned by the tokenizer under the key "attention_mask":
+
+```python
+>>> padded_sequences["attention_mask"]
+[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
+```
+
+<a id='token-type-ids'></a>
+
+### Token Type IDs
+
+Some models' purpose is to do classification on pairs of sentences or question answering.
+
+<Youtube id="0u3ioSwev3s"/>
+
+These require two different sequences to be joined in a single "input_ids" entry, which usually is performed with the
+help of special tokens, such as the classifier (`[CLS]`) and separator (`[SEP]`) tokens. For example, the BERT
+model builds its two sequence input as such:
+
+```python
+>>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
+```
+
+We can use our tokenizer to automatically generate such a sentence by passing the two sequences to `tokenizer` as two
+arguments (and not a list, like before) like this:
+
+```python
+>>> from transformers import BertTokenizer
+>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+>>> sequence_a = "HuggingFace is based in NYC"
+>>> sequence_b = "Where is HuggingFace based?"
+
+>>> encoded_dict = tokenizer(sequence_a, sequence_b)
+>>> decoded = tokenizer.decode(encoded_dict["input_ids"])
+```
+
+which will return:
+
+```python
+>>> print(decoded)
+[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
+```
+
+This is enough for some models to understand where one sequence ends and where another begins. However, other models,
+such as BERT, also deploy token type IDs (also called segment IDs). They are represented as a binary mask identifying
+the two types of sequence in the model.
+
+The tokenizer returns this mask as the "token_type_ids" entry:
+
+```python
+>>> encoded_dict['token_type_ids']
+[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+```
+
+The first sequence, the "context" used for the question, has all its tokens represented by a `0`, whereas the
+second sequence, corresponding to the "question", has all its tokens represented by a `1`.
+
+Some models, like [`XLNetModel`] use an additional token represented by a `2`.
+
+<a id='position-ids'></a>
+
+### Position IDs
+
+Contrary to RNNs that have the position of each token embedded within them, transformers are unaware of the position of
+each token. Therefore, the position IDs (`position_ids`) are used by the model to identify each token's position in
+the list of tokens.
+
+They are an optional parameter. If no `position_ids` are passed to the model, the IDs are automatically created as
+absolute positional embeddings.
+
+Absolute positional embeddings are selected in the range `[0, config.max_position_embeddings - 1]`. Some models use
+other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
+
+<a id='labels'></a>
+
+### Labels
+
+The labels are an optional argument which can be passed in order for the model to compute the loss itself. These labels
+should be the expected prediction of the model: it will use the standard loss in order to compute the loss between its
+predictions and the expected value (the label).
+
+These labels are different according to the model head, for example:
+
+- For sequence classification models (e.g., [`BertForSequenceClassification`]), the model expects a
+  tensor of dimension `(batch_size)` with each value of the batch corresponding to the expected label of the
+  entire sequence.
+- For token classification models (e.g., [`BertForTokenClassification`]), the model expects a tensor
+  of dimension `(batch_size, seq_length)` with each value corresponding to the expected label of each individual
+  token.
+- For masked language modeling (e.g., [`BertForMaskedLM`]), the model expects a tensor of dimension
+  `(batch_size, seq_length)` with each value corresponding to the expected label of each individual token: the
+  labels being the token ID for the masked token, and values to be ignored for the rest (usually -100).
+- For sequence to sequence tasks,(e.g., [`BartForConditionalGeneration`],
+  [`MBartForConditionalGeneration`]), the model expects a tensor of dimension `(batch_size, tgt_seq_length)` with each value corresponding to the target sequences associated with each input sequence. During
+  training, both *BART* and *T5* will make the appropriate *decoder_input_ids* and decoder attention masks internally.
+  They usually do not need to be supplied. This does not apply to models leveraging the Encoder-Decoder framework. See
+  the documentation of each model for more information on each specific model's labels.
+
+The base models (e.g., [`BertModel`]) do not accept labels, as these are the base transformer
+models, simply outputting features.
+
+<a id='decoder-input-ids'></a>
+
+### Decoder input IDs
+
+This input is specific to encoder-decoder models, and contains the input IDs that will be fed to the decoder. These
+inputs should be used for sequence to sequence tasks, such as translation or summarization, and are usually built in a
+way specific to each model.
+
+Most encoder-decoder models (BART, T5) create their `decoder_input_ids` on their own from the `labels`. In
+such models, passing the `labels` is the preferred way to handle training.
+
+Please check each model's docs to see how they handle these input IDs for sequence to sequence training.
+
+<a id='feed-forward-chunking'></a>
+
+### Feed Forward Chunking
+
+In each residual attention block in transformers the self-attention layer is usually followed by 2 feed forward layers.
+The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (e.g., for
+`bert-base-uncased`).
+
+For an input of size `[batch_size, sequence_length]`, the memory required to store the intermediate feed forward
+embeddings `[batch_size, sequence_length, config.intermediate_size]` can account for a large fraction of the memory
+use. The authors of [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) noticed that since the
+computation is independent of the `sequence_length` dimension, it is mathematically equivalent to compute the output
+embeddings of both feed forward layers `[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n`
+individually and concat them afterward to `[batch_size, sequence_length, config.hidden_size]` with `n = sequence_length`, which trades increased computation time against reduced memory use, but yields a mathematically
+**equivalent** result.
+
+For models employing the function [`apply_chunking_to_forward`], the `chunk_size` defines the
+number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time
+complexity. If `chunk_size` is set to 0, no feed forward chunking is done.
diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst
deleted file mode 100644
index d95ed105cf..0000000000
--- a/docs/source/glossary.rst
+++ /dev/null
@@ -1,322 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Glossary
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-General terms
------------------------------------------------------------------------------------------------------------------------
-
-- autoencoding models: see MLM
-- autoregressive models: see CLM
-- CLM: causal language modeling, a pretraining task where the model reads the texts in order and has to predict the
-  next word. It's usually done by reading the whole sentence but using a mask inside the model to hide the future
-  tokens at a certain timestep.
-- deep learning: machine learning algorithms which uses neural networks with several layers.
-- MLM: masked language modeling, a pretraining task where the model sees a corrupted version of the texts, usually done
-  by masking some tokens randomly, and has to predict the original text.
-- multimodal: a task that combines texts with another kind of inputs (for instance images).
-- NLG: natural language generation, all tasks related to generating text (for instance talk with transformers,
-  translation).
-- NLP: natural language processing, a generic way to say "deal with texts".
-- NLU: natural language understanding, all tasks related to understanding what is in a text (for instance classifying
-  the whole text, individual words).
-- pretrained model: a model that has been pretrained on some data (for instance all of Wikipedia). Pretraining methods
-  involve a self-supervised objective, which can be reading the text and trying to predict the next word (see CLM) or
-  masking some words and trying to predict them (see MLM).
-- RNN: recurrent neural network, a type of model that uses a loop over a layer to process texts.
-- self-attention: each element of the input finds out which other elements of the input they should attend to.
-- seq2seq or sequence-to-sequence: models that generate a new sequence from an input, like translation models, or
-  summarization models (such as :doc:`Bart </model_doc/bart>` or :doc:`T5 </model_doc/t5>`).
-- token: a part of a sentence, usually a word, but can also be a subword (non-common words are often split in subwords)
-  or a punctuation symbol.
-- transformer: self-attention based deep learning model architecture.
-
-Model inputs
------------------------------------------------------------------------------------------------------------------------
-
-Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
-detailed here alongside usage examples.
-
-.. _input-ids:
-
-Input IDs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
-numerical representations of tokens building the sequences that will be used as input by the model*.
-
-.. raw:: html
-
-   <iframe width="560" height="315" src="https://www.youtube.com/embed/VFp38yj8h3A" title="YouTube video player"
-   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
-   picture-in-picture" allowfullscreen></iframe>
-
-Each tokenizer works differently but the underlying mechanism remains the same. Here's an example using the BERT
-tokenizer, which is a `WordPiece <https://arxiv.org/pdf/1609.08144.pdf>`__ tokenizer:
-
-.. code-block::
-
-    >>> from transformers import BertTokenizer
-    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-
-    >>> sequence = "A Titan RTX has 24GB of VRAM"
-
-The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.
-
-.. code-block::
-
-    >>> tokenized_sequence = tokenizer.tokenize(sequence)
-
-The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
-in "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash prefix
-is added for "RA" and "M":
-
-.. code-block::
-
-    >>> print(tokenized_sequence)
-    ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
-
-These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
-the sentence to the tokenizer, which leverages the Rust implementation of `huggingface/tokenizers
-<https://github.com/huggingface/tokenizers>`__ for peak performance.
-
-.. code-block::
-
-    >>> inputs = tokenizer(sequence)
-
-The tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly. The
-token indices are under the key "input_ids":
-
-.. code-block::
-
-    >>> encoded_sequence = inputs["input_ids"]
-    >>> print(encoded_sequence)
-    [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
-
-Note that the tokenizer automatically adds "special tokens" (if the associated model relies on them) which are special
-IDs the model sometimes uses.
-
-If we decode the previous sequence of ids,
-
-.. code-block::
-
-    >>> decoded_sequence = tokenizer.decode(encoded_sequence)
-
-we will see
-
-.. code-block::
-
-    >>> print(decoded_sequence)
-    [CLS] A Titan RTX has 24GB of VRAM [SEP]
-
-because this is the way a :class:`~transformers.BertModel` is going to expect its inputs.
-
-.. _attention-mask:
-
-Attention mask
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The attention mask is an optional argument used when batching sequences together.
-
-.. raw:: html
-
-   <iframe width="560" height="315" src="https://www.youtube.com/embed/M6adb1j2jPI" title="YouTube video player"
-   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
-   picture-in-picture" allowfullscreen></iframe>
-
-This argument indicates to the model which tokens should be attended to, and which should not.
-
-For example, consider these two sequences:
-
-.. code-block::
-
-    >>> from transformers import BertTokenizer
-    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-
-    >>> sequence_a = "This is a short sequence."
-    >>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
-
-    >>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
-    >>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
-
-The encoded versions have different lengths:
-
-.. code-block::
-
-    >>> len(encoded_sequence_a), len(encoded_sequence_b)
-    (8, 19)
-
-Therefore, we can't put them together in the same tensor as-is. The first sequence needs to be padded up to the length
-of the second one, or the second one needs to be truncated down to the length of the first one.
-
-In the first case, the list of IDs will be extended by the padding indices. We can pass a list to the tokenizer and ask
-it to pad like this:
-
-.. code-block::
-
-    >>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
-
-We can see that 0s have been added on the right of the first sentence to make it the same length as the second one:
-
-.. code-block::
-
-    >>> padded_sequences["input_ids"]
-    [[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
-
-This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating the
-position of the padded indices so that the model does not attend to them. For the :class:`~transformers.BertTokenizer`,
-:obj:`1` indicates a value that should be attended to, while :obj:`0` indicates a padded value. This attention mask is
-in the dictionary returned by the tokenizer under the key "attention_mask":
-
-.. code-block::
-
-    >>> padded_sequences["attention_mask"]
-    [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
-
-.. _token-type-ids:
-
-Token Type IDs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Some models' purpose is to do classification on pairs of sentences or question answering.
-
-.. raw:: html
-
-   <iframe width="560" height="315" src="https://www.youtube.com/embed/0u3ioSwev3s" title="YouTube video player"
-   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
-   picture-in-picture" allowfullscreen></iframe>
-
-These require two different sequences to be joined in a single "input_ids" entry, which usually is performed with the
-help of special tokens, such as the classifier (``[CLS]``) and separator (``[SEP]``) tokens. For example, the BERT
-model builds its two sequence input as such:
-
-.. code-block::
-
-    >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
-
-We can use our tokenizer to automatically generate such a sentence by passing the two sequences to ``tokenizer`` as two
-arguments (and not a list, like before) like this:
-
-.. code-block::
-
-    >>> from transformers import BertTokenizer
-    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-    >>> sequence_a = "HuggingFace is based in NYC"
-    >>> sequence_b = "Where is HuggingFace based?"
-
-    >>> encoded_dict = tokenizer(sequence_a, sequence_b)
-    >>> decoded = tokenizer.decode(encoded_dict["input_ids"])
-
-which will return:
-
-.. code-block::
-
-    >>> print(decoded)
-    [CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
-
-This is enough for some models to understand where one sequence ends and where another begins. However, other models,
-such as BERT, also deploy token type IDs (also called segment IDs). They are represented as a binary mask identifying
-the two types of sequence in the model.
-
-The tokenizer returns this mask as the "token_type_ids" entry:
-
-.. code-block::
-
-    >>> encoded_dict['token_type_ids']
-    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
-
-The first sequence, the "context" used for the question, has all its tokens represented by a :obj:`0`, whereas the
-second sequence, corresponding to the "question", has all its tokens represented by a :obj:`1`.
-
-Some models, like :class:`~transformers.XLNetModel` use an additional token represented by a :obj:`2`.
-
-.. _position-ids:
-
-Position IDs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Contrary to RNNs that have the position of each token embedded within them, transformers are unaware of the position of
-each token. Therefore, the position IDs (``position_ids``) are used by the model to identify each token's position in
-the list of tokens.
-
-They are an optional parameter. If no ``position_ids`` are passed to the model, the IDs are automatically created as
-absolute positional embeddings.
-
-Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models use
-other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
-
-.. _labels:
-
-Labels
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The labels are an optional argument which can be passed in order for the model to compute the loss itself. These labels
-should be the expected prediction of the model: it will use the standard loss in order to compute the loss between its
-predictions and the expected value (the label).
-
-These labels are different according to the model head, for example:
-
-- For sequence classification models (e.g., :class:`~transformers.BertForSequenceClassification`), the model expects a
-  tensor of dimension :obj:`(batch_size)` with each value of the batch corresponding to the expected label of the
-  entire sequence.
-- For token classification models (e.g., :class:`~transformers.BertForTokenClassification`), the model expects a tensor
-  of dimension :obj:`(batch_size, seq_length)` with each value corresponding to the expected label of each individual
-  token.
-- For masked language modeling (e.g., :class:`~transformers.BertForMaskedLM`), the model expects a tensor of dimension
-  :obj:`(batch_size, seq_length)` with each value corresponding to the expected label of each individual token: the
-  labels being the token ID for the masked token, and values to be ignored for the rest (usually -100).
-- For sequence to sequence tasks,(e.g., :class:`~transformers.BartForConditionalGeneration`,
-  :class:`~transformers.MBartForConditionalGeneration`), the model expects a tensor of dimension :obj:`(batch_size,
-  tgt_seq_length)` with each value corresponding to the target sequences associated with each input sequence. During
-  training, both `BART` and `T5` will make the appropriate `decoder_input_ids` and decoder attention masks internally.
-  They usually do not need to be supplied. This does not apply to models leveraging the Encoder-Decoder framework. See
-  the documentation of each model for more information on each specific model's labels.
-
-The base models (e.g., :class:`~transformers.BertModel`) do not accept labels, as these are the base transformer
-models, simply outputting features.
-
-.. _decoder-input-ids:
-
-Decoder input IDs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This input is specific to encoder-decoder models, and contains the input IDs that will be fed to the decoder. These
-inputs should be used for sequence to sequence tasks, such as translation or summarization, and are usually built in a
-way specific to each model.
-
-Most encoder-decoder models (BART, T5) create their :obj:`decoder_input_ids` on their own from the :obj:`labels`. In
-such models, passing the :obj:`labels` is the preferred way to handle training.
-
-Please check each model's docs to see how they handle these input IDs for sequence to sequence training.
-
-.. _feed-forward-chunking:
-
-Feed Forward Chunking
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In each residual attention block in transformers the self-attention layer is usually followed by 2 feed forward layers.
-The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (e.g., for
-``bert-base-uncased``).
-
-For an input of size ``[batch_size, sequence_length]``, the memory required to store the intermediate feed forward
-embeddings ``[batch_size, sequence_length, config.intermediate_size]`` can account for a large fraction of the memory
-use. The authors of `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ noticed that since the
-computation is independent of the ``sequence_length`` dimension, it is mathematically equivalent to compute the output
-embeddings of both feed forward layers ``[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n``
-individually and concat them afterward to ``[batch_size, sequence_length, config.hidden_size]`` with ``n =
-sequence_length``, which trades increased computation time against reduced memory use, but yields a mathematically
-**equivalent** result.
-
-For models employing the function :func:`~.transformers.apply_chunking_to_forward`, the ``chunk_size`` defines the
-number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time
-complexity. If ``chunk_size`` is set to 0, no feed forward chunking is done.
diff --git a/docs/source/installation.md b/docs/source/installation.mdx
similarity index 100%
rename from docs/source/installation.md
rename to docs/source/installation.mdx
diff --git a/docs/source/internal/file_utils.mdx b/docs/source/internal/file_utils.mdx
new file mode 100644
index 0000000000..d0d568ce79
--- /dev/null
+++ b/docs/source/internal/file_utils.mdx
@@ -0,0 +1,46 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# General Utilities
+
+This page lists all of Transformers general utility functions that are found in the file `file_utils.py`.
+
+Most of those are only useful if you are studying the general code in the library.
+
+
+## Enums and namedtuples
+
+[[autodoc]] file_utils.ExplicitEnum
+
+[[autodoc]] file_utils.PaddingStrategy
+
+[[autodoc]] file_utils.TensorType
+
+## Special Decorators
+
+[[autodoc]] file_utils.add_start_docstrings
+
+[[autodoc]] file_utils.add_start_docstrings_to_model_forward
+
+[[autodoc]] file_utils.add_end_docstrings
+
+[[autodoc]] file_utils.add_code_sample_docstrings
+
+[[autodoc]] file_utils.replace_return_docstrings
+
+## Special Properties
+
+[[autodoc]] file_utils.cached_property
+
+## Other Utilities
+
+[[autodoc]] file_utils._LazyModule
diff --git a/docs/source/internal/file_utils.rst b/docs/source/internal/file_utils.rst
deleted file mode 100644
index 361ef637e6..0000000000
--- a/docs/source/internal/file_utils.rst
+++ /dev/null
@@ -1,54 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-General Utilities
------------------------------------------------------------------------------------------------------------------------
-
-This page lists all of Transformers general utility functions that are found in the file ``file_utils.py``.
-
-Most of those are only useful if you are studying the general code in the library.
-
-
-Enums and namedtuples
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.file_utils.ExplicitEnum
-
-.. autoclass:: transformers.file_utils.PaddingStrategy
-
-.. autoclass:: transformers.file_utils.TensorType
-
-
-Special Decorators
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: transformers.file_utils.add_start_docstrings
-
-.. autofunction:: transformers.file_utils.add_start_docstrings_to_model_forward
-
-.. autofunction:: transformers.file_utils.add_end_docstrings
-
-.. autofunction:: transformers.file_utils.add_code_sample_docstrings
-
-.. autofunction:: transformers.file_utils.replace_return_docstrings
-
-
-Special Properties
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.file_utils.cached_property
-
-
-Other Utilities
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.file_utils._LazyModule
diff --git a/docs/source/internal/generation_utils.mdx b/docs/source/internal/generation_utils.mdx
new file mode 100644
index 0000000000..e1f844254f
--- /dev/null
+++ b/docs/source/internal/generation_utils.mdx
@@ -0,0 +1,207 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Utilities for Generation
+
+This page lists all the utility functions used by [`~generation_utils.GenerationMixin.generate`],
+[`~generation_utils.GenerationMixin.greedy_search`],
+[`~generation_utils.GenerationMixin.sample`],
+[`~generation_utils.GenerationMixin.beam_search`],
+[`~generation_utils.GenerationMixin.beam_sample`], and
+[`~generation_utils.GenerationMixin.group_beam_search`].
+
+Most of those are only useful if you are studying the code of the generate methods in the library.
+
+## Generate Outputs
+
+The output of [`~generation_utils.GenerationMixin.generate`] is an instance of a subclass of
+[`~file_utils.ModelOutput`]. This output is a data structure containing all the information returned
+by [`~generation_utils.GenerationMixin.generate`], but that can also be used as tuple or dictionary.
+
+Here's an example:
+
+```python
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+model = GPT2LMHeadModel.from_pretrained('gpt2')
+
+inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
+generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
+```
+
+The `generation_output` object is a [`~generation_utils.GreedySearchDecoderOnlyOutput`], as we can
+see in the documentation of that class below, it means it has the following attributes:
+
+- `sequences`: the generated sequences of tokens
+- `scores` (optional): the prediction scores of the language modelling head, for each generation step
+- `hidden_states` (optional): the hidden states of the model, for each generation step
+- `attentions` (optional): the attention weights of the model, for each generation step
+
+Here we have the `scores` since we passed along `output_scores=True`, but we don't have `hidden_states` and
+`attentions` because we didn't pass `output_hidden_states=True` or `output_attentions=True`.
+
+You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
+will get `None`. Here for instance `generation_output.scores` are all the generated prediction scores of the
+language modeling head, and `generation_output.attentions` is `None`.
+
+When using our `generation_output` object as a tuple, it only keeps the attributes that don't have `None` values.
+Here, for instance, it has two elements, `loss` then `logits`, so
+
+```python
+generation_output[:2]
+```
+
+will return the tuple `(generation_output.sequences, generation_output.scores)` for instance.
+
+When using our `generation_output` object as a dictionary, it only keeps the attributes that don't have `None`
+values. Here, for instance, it has two keys that are `sequences` and `scores`.
+
+We document here all output types.
+
+
+### GreedySearchOutput
+
+[[autodoc]] generation_utils.GreedySearchDecoderOnlyOutput
+
+[[autodoc]] generation_utils.GreedySearchEncoderDecoderOutput
+
+[[autodoc]] generation_flax_utils.FlaxGreedySearchOutput
+
+### SampleOutput
+
+[[autodoc]] generation_utils.SampleDecoderOnlyOutput
+
+[[autodoc]] generation_utils.SampleEncoderDecoderOutput
+
+[[autodoc]] generation_flax_utils.FlaxSampleOutput
+
+### BeamSearchOutput
+
+[[autodoc]] generation_utils.BeamSearchDecoderOnlyOutput
+
+[[autodoc]] generation_utils.BeamSearchEncoderDecoderOutput
+
+### BeamSampleOutput
+
+[[autodoc]] generation_utils.BeamSampleDecoderOnlyOutput
+
+[[autodoc]] generation_utils.BeamSampleEncoderDecoderOutput
+
+## LogitsProcessor
+
+A [`LogitsProcessor`] can be used to modify the prediction scores of a language model head for
+generation.
+
+[[autodoc]] LogitsProcessor
+    - __call__
+
+[[autodoc]] LogitsProcessorList
+    - __call__
+
+[[autodoc]] LogitsWarper
+    - __call__
+
+[[autodoc]] MinLengthLogitsProcessor
+    - __call__
+
+[[autodoc]] TemperatureLogitsWarper
+    - __call__
+
+[[autodoc]] RepetitionPenaltyLogitsProcessor
+    - __call__
+
+[[autodoc]] TopPLogitsWarper
+    - __call__
+
+[[autodoc]] TopKLogitsWarper
+    - __call__
+
+[[autodoc]] NoRepeatNGramLogitsProcessor
+    - __call__
+
+[[autodoc]] NoBadWordsLogitsProcessor
+    - __call__
+
+[[autodoc]] PrefixConstrainedLogitsProcessor
+    - __call__
+
+[[autodoc]] HammingDiversityLogitsProcessor
+    - __call__
+
+[[autodoc]] ForcedBOSTokenLogitsProcessor
+    - __call__
+
+[[autodoc]] ForcedEOSTokenLogitsProcessor
+    - __call__
+
+[[autodoc]] InfNanRemoveLogitsProcessor
+    - __call__
+
+[[autodoc]] FlaxLogitsProcessor
+    - __call__
+
+[[autodoc]] FlaxLogitsProcessorList
+    - __call__
+
+[[autodoc]] FlaxLogitsWarper
+    - __call__
+
+[[autodoc]] FlaxTemperatureLogitsWarper
+    - __call__
+
+[[autodoc]] FlaxTopPLogitsWarper
+    - __call__
+
+[[autodoc]] FlaxTopKLogitsWarper
+    - __call__
+
+[[autodoc]] FlaxForcedBOSTokenLogitsProcessor
+    - __call__
+
+[[autodoc]] FlaxForcedEOSTokenLogitsProcessor
+    - __call__
+
+[[autodoc]] FlaxMinLengthLogitsProcessor
+    - __call__
+
+## StoppingCriteria
+
+A [`StoppingCriteria`] can be used to change when to stop generation (other than EOS token).
+
+[[autodoc]] StoppingCriteria
+    - __call__
+
+[[autodoc]] StoppingCriteriaList
+    - __call__
+
+[[autodoc]] MaxLengthCriteria
+    - __call__
+
+[[autodoc]] MaxTimeCriteria
+    - __call__
+
+## BeamSearch
+
+[[autodoc]] BeamScorer
+    - process
+    - finalize
+
+[[autodoc]] BeamSearchScorer
+    - process
+    - finalize
+
+## Utilities
+
+[[autodoc]] top_k_top_p_filtering
+
+[[autodoc]] tf_top_k_top_p_filtering
diff --git a/docs/source/internal/generation_utils.rst b/docs/source/internal/generation_utils.rst
deleted file mode 100644
index 00dff63d98..0000000000
--- a/docs/source/internal/generation_utils.rst
+++ /dev/null
@@ -1,230 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Utilities for Generation
------------------------------------------------------------------------------------------------------------------------
-
-This page lists all the utility functions used by :meth:`~transformers.generation_utils.GenerationMixin.generate`,
-:meth:`~transformers.generation_utils.GenerationMixin.greedy_search`,
-:meth:`~transformers.generation_utils.GenerationMixin.sample`,
-:meth:`~transformers.generation_utils.GenerationMixin.beam_search`,
-:meth:`~transformers.generation_utils.GenerationMixin.beam_sample`, and
-:meth:`~transformers.generation_utils.GenerationMixin.group_beam_search`.
-
-Most of those are only useful if you are studying the code of the generate methods in the library.
-
-Generate Outputs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The output of :meth:`~transformers.generation_utils.GenerationMixin.generate` is an instance of a subclass of
-:class:`~transformers.file_utils.ModelOutput`. This output is a data structure containing all the information returned
-by :meth:`~transformers.generation_utils.GenerationMixin.generate`, but that can also be used as tuple or dictionary.
-
-Here's an example:
-
-.. code-block::
-
-    from transformers import GPT2Tokenizer, GPT2LMHeadModel
-
-    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-    model = GPT2LMHeadModel.from_pretrained('gpt2')
-
-    inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
-    generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
-
-The ``generation_output`` object is a :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`, as we can
-see in the documentation of that class below, it means it has the following attributes:
-
-- ``sequences``: the generated sequences of tokens
-- ``scores`` (optional): the prediction scores of the language modelling head, for each generation step
-- ``hidden_states`` (optional): the hidden states of the model, for each generation step
-- ``attentions`` (optional): the attention weights of the model, for each generation step
-
-Here we have the ``scores`` since we passed along ``output_scores=True``, but we don't have ``hidden_states`` and
-``attentions`` because we didn't pass ``output_hidden_states=True`` or ``output_attentions=True``.
-
-You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
-will get ``None``. Here for instance ``generation_output.scores`` are all the generated prediction scores of the
-language modeling head, and ``generation_output.attentions`` is ``None``.
-
-When using our ``generation_output`` object as a tuple, it only keeps the attributes that don't have ``None`` values.
-Here, for instance, it has two elements, ``loss`` then ``logits``, so
-
-.. code-block::
-
-    generation_output[:2]
-
-will return the tuple ``(generation_output.sequences, generation_output.scores)`` for instance.
-
-When using our ``generation_output`` object as a dictionary, it only keeps the attributes that don't have ``None``
-values. Here, for instance, it has two keys that are ``sequences`` and ``scores``.
-
-We document here all output types.
-
-
-GreedySearchOutput
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. autoclass:: transformers.generation_utils.GreedySearchDecoderOnlyOutput
-    :members:
-
-.. autoclass:: transformers.generation_utils.GreedySearchEncoderDecoderOutput
-    :members:
-
-.. autoclass:: transformers.generation_flax_utils.FlaxGreedySearchOutput
-    :members:
-
-
-SampleOutput
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. autoclass:: transformers.generation_utils.SampleDecoderOnlyOutput
-    :members:
-
-.. autoclass:: transformers.generation_utils.SampleEncoderDecoderOutput
-    :members:
-
-.. autoclass:: transformers.generation_flax_utils.FlaxSampleOutput
-    :members:
-
-
-BeamSearchOutput
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. autoclass:: transformers.generation_utils.BeamSearchDecoderOnlyOutput
-    :members:
-
-.. autoclass:: transformers.generation_utils.BeamSearchEncoderDecoderOutput
-    :members:
-
-
-BeamSampleOutput
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. autoclass:: transformers.generation_utils.BeamSampleDecoderOnlyOutput
-    :members:
-
-.. autoclass:: transformers.generation_utils.BeamSampleEncoderDecoderOutput
-    :members:
-
-
-LogitsProcessor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-A :class:`~transformers.LogitsProcessor` can be used to modify the prediction scores of a language model head for
-generation.
-
-.. autoclass:: transformers.LogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.LogitsProcessorList
-    :members: __call__
-
-.. autoclass:: transformers.LogitsWarper
-    :members: __call__
-
-.. autoclass:: transformers.MinLengthLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.TemperatureLogitsWarper
-    :members: __call__
-
-.. autoclass:: transformers.RepetitionPenaltyLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.TopPLogitsWarper
-    :members: __call__
-
-.. autoclass:: transformers.TopKLogitsWarper
-    :members: __call__
-
-.. autoclass:: transformers.NoRepeatNGramLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.NoBadWordsLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.PrefixConstrainedLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.HammingDiversityLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.ForcedBOSTokenLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.ForcedEOSTokenLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.InfNanRemoveLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.FlaxLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.FlaxLogitsProcessorList
-    :members: __call__
-
-.. autoclass:: transformers.FlaxLogitsWarper
-    :members: __call__
-
-.. autoclass:: transformers.FlaxTemperatureLogitsWarper
-    :members: __call__
-
-.. autoclass:: transformers.FlaxTopPLogitsWarper
-    :members: __call__
-
-.. autoclass:: transformers.FlaxTopKLogitsWarper
-    :members: __call__
-
-.. autoclass:: transformers.FlaxForcedBOSTokenLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.FlaxForcedEOSTokenLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.FlaxMinLengthLogitsProcessor
-    :members: __call__
-
-
-StoppingCriteria
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-A :class:`~transformers.StoppingCriteria` can be used to change when to stop generation (other than EOS token).
-
-.. autoclass:: transformers.StoppingCriteria
-    :members: __call__
-
-.. autoclass:: transformers.StoppingCriteriaList
-    :members: __call__
-
-.. autoclass:: transformers.MaxLengthCriteria
-    :members: __call__
-
-.. autoclass:: transformers.MaxTimeCriteria
-    :members: __call__
-
-BeamSearch
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BeamScorer
-    :members: process, finalize
-
-.. autoclass:: transformers.BeamSearchScorer
-    :members: process, finalize
-
-Utilities
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: transformers.top_k_top_p_filtering
-
-.. autofunction:: transformers.tf_top_k_top_p_filtering
diff --git a/docs/source/internal/modeling_utils.mdx b/docs/source/internal/modeling_utils.mdx
new file mode 100644
index 0000000000..af4fdf72a8
--- /dev/null
+++ b/docs/source/internal/modeling_utils.mdx
@@ -0,0 +1,82 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Custom Layers and Utilities
+
+This page lists all the custom layers used by the library, as well as the utility functions it provides for modeling.
+
+Most of those are only useful if you are studying the code of the models in the library.
+
+
+## Pytorch custom modules
+
+[[autodoc]] modeling_utils.Conv1D
+
+[[autodoc]] modeling_utils.PoolerStartLogits
+    - forward
+
+[[autodoc]] modeling_utils.PoolerEndLogits
+    - forward
+
+[[autodoc]] modeling_utils.PoolerAnswerClass
+    - forward
+
+[[autodoc]] modeling_utils.SquadHeadOutput
+
+[[autodoc]] modeling_utils.SQuADHead
+    - forward
+
+[[autodoc]] modeling_utils.SequenceSummary
+    - forward
+
+## PyTorch Helper Functions
+
+[[autodoc]] apply_chunking_to_forward
+
+[[autodoc]] modeling_utils.find_pruneable_heads_and_indices
+
+[[autodoc]] modeling_utils.prune_layer
+
+[[autodoc]] modeling_utils.prune_conv1d_layer
+
+[[autodoc]] modeling_utils.prune_linear_layer
+
+## TensorFlow custom layers
+
+[[autodoc]] modeling_tf_utils.TFConv1D
+
+[[autodoc]] modeling_tf_utils.TFSharedEmbeddings
+    - call
+
+[[autodoc]] modeling_tf_utils.TFSequenceSummary
+
+## TensorFlow loss functions
+
+[[autodoc]] modeling_tf_utils.TFCausalLanguageModelingLoss
+
+[[autodoc]] modeling_tf_utils.TFMaskedLanguageModelingLoss
+
+[[autodoc]] modeling_tf_utils.TFMultipleChoiceLoss
+
+[[autodoc]] modeling_tf_utils.TFQuestionAnsweringLoss
+
+[[autodoc]] modeling_tf_utils.TFSequenceClassificationLoss
+
+[[autodoc]] modeling_tf_utils.TFTokenClassificationLoss
+
+## TensorFlow Helper Functions
+
+[[autodoc]] modeling_tf_utils.get_initializer
+
+[[autodoc]] modeling_tf_utils.keras_serializable
+
+[[autodoc]] modeling_tf_utils.shape_list
diff --git a/docs/source/internal/modeling_utils.rst b/docs/source/internal/modeling_utils.rst
deleted file mode 100644
index 879f81058a..0000000000
--- a/docs/source/internal/modeling_utils.rst
+++ /dev/null
@@ -1,97 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Custom Layers and Utilities
------------------------------------------------------------------------------------------------------------------------
-
-This page lists all the custom layers used by the library, as well as the utility functions it provides for modeling.
-
-Most of those are only useful if you are studying the code of the models in the library.
-
-
-Pytorch custom modules
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_utils.Conv1D
-
-.. autoclass:: transformers.modeling_utils.PoolerStartLogits
-    :members: forward
-
-.. autoclass:: transformers.modeling_utils.PoolerEndLogits
-    :members: forward
-
-.. autoclass:: transformers.modeling_utils.PoolerAnswerClass
-    :members: forward
-
-.. autoclass:: transformers.modeling_utils.SquadHeadOutput
-
-.. autoclass:: transformers.modeling_utils.SQuADHead
-    :members: forward
-
-.. autoclass:: transformers.modeling_utils.SequenceSummary
-    :members: forward
-
-
-PyTorch Helper Functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: transformers.apply_chunking_to_forward
-
-.. autofunction:: transformers.modeling_utils.find_pruneable_heads_and_indices
-
-.. autofunction:: transformers.modeling_utils.prune_layer
-
-.. autofunction:: transformers.modeling_utils.prune_conv1d_layer
-
-.. autofunction:: transformers.modeling_utils.prune_linear_layer
-
-TensorFlow custom layers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_utils.TFConv1D
-
-.. autoclass:: transformers.modeling_tf_utils.TFSharedEmbeddings
-    :members: call
-
-.. autoclass:: transformers.modeling_tf_utils.TFSequenceSummary
-
-
-TensorFlow loss functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_utils.TFCausalLanguageModelingLoss
-    :members:
-
-.. autoclass:: transformers.modeling_tf_utils.TFMaskedLanguageModelingLoss
-    :members:
-
-.. autoclass:: transformers.modeling_tf_utils.TFMultipleChoiceLoss
-    :members:
-
-.. autoclass:: transformers.modeling_tf_utils.TFQuestionAnsweringLoss
-    :members:
-
-.. autoclass:: transformers.modeling_tf_utils.TFSequenceClassificationLoss
-    :members:
-
-.. autoclass:: transformers.modeling_tf_utils.TFTokenClassificationLoss
-    :members:
-
-
-TensorFlow Helper Functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: transformers.modeling_tf_utils.get_initializer
-
-.. autofunction:: transformers.modeling_tf_utils.keras_serializable
-
-.. autofunction:: transformers.modeling_tf_utils.shape_list
diff --git a/docs/source/internal/pipelines_utils.mdx b/docs/source/internal/pipelines_utils.mdx
new file mode 100644
index 0000000000..ed8e75b414
--- /dev/null
+++ b/docs/source/internal/pipelines_utils.mdx
@@ -0,0 +1,40 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Utilities for pipelines
+
+This page lists all the utility functions the library provides for pipelines.
+
+Most of those are only useful if you are studying the code of the models in the library.
+
+
+## Argument handling
+
+[[autodoc]] pipelines.ArgumentHandler
+
+[[autodoc]] pipelines.ZeroShotClassificationArgumentHandler
+
+[[autodoc]] pipelines.QuestionAnsweringArgumentHandler
+
+## Data format
+
+[[autodoc]] pipelines.PipelineDataFormat
+
+[[autodoc]] pipelines.CsvPipelineDataFormat
+
+[[autodoc]] pipelines.JsonPipelineDataFormat
+
+[[autodoc]] pipelines.PipedPipelineDataFormat
+
+## Utilities
+
+[[autodoc]] pipelines.PipelineException
diff --git a/docs/source/internal/pipelines_utils.rst b/docs/source/internal/pipelines_utils.rst
deleted file mode 100644
index e2181a6550..0000000000
--- a/docs/source/internal/pipelines_utils.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Utilities for pipelines
------------------------------------------------------------------------------------------------------------------------
-
-This page lists all the utility functions the library provides for pipelines.
-
-Most of those are only useful if you are studying the code of the models in the library.
-
-
-Argument handling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.pipelines.ArgumentHandler
-
-.. autoclass:: transformers.pipelines.ZeroShotClassificationArgumentHandler
-
-.. autoclass:: transformers.pipelines.QuestionAnsweringArgumentHandler
-
-
-Data format
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.pipelines.PipelineDataFormat
-    :members:
-
-.. autoclass:: transformers.pipelines.CsvPipelineDataFormat
-    :members:
-
-.. autoclass:: transformers.pipelines.JsonPipelineDataFormat
-    :members:
-
-.. autoclass:: transformers.pipelines.PipedPipelineDataFormat
-    :members:
-
-
-Utilities
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.pipelines.PipelineException
diff --git a/docs/source/internal/tokenization_utils.mdx b/docs/source/internal/tokenization_utils.mdx
new file mode 100644
index 0000000000..24e81f7020
--- /dev/null
+++ b/docs/source/internal/tokenization_utils.mdx
@@ -0,0 +1,38 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Utilities for Tokenizers
+
+This page lists all the utility functions used by the tokenizers, mainly the class
+[`~tokenization_utils_base.PreTrainedTokenizerBase`] that implements the common methods between
+[`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] and the mixin
+[`~tokenization_utils_base.SpecialTokensMixin`].
+
+Most of those are only useful if you are studying the code of the tokenizers in the library.
+
+## PreTrainedTokenizerBase
+
+[[autodoc]] tokenization_utils_base.PreTrainedTokenizerBase
+    - __call__
+    - all
+
+## SpecialTokensMixin
+
+[[autodoc]] tokenization_utils_base.SpecialTokensMixin
+
+## Enums and namedtuples
+
+[[autodoc]] tokenization_utils_base.TruncationStrategy
+
+[[autodoc]] tokenization_utils_base.CharSpan
+
+[[autodoc]] tokenization_utils_base.TokenSpan
diff --git a/docs/source/internal/tokenization_utils.rst b/docs/source/internal/tokenization_utils.rst
deleted file mode 100644
index 4198c552c8..0000000000
--- a/docs/source/internal/tokenization_utils.rst
+++ /dev/null
@@ -1,45 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Utilities for Tokenizers
------------------------------------------------------------------------------------------------------------------------
-
-This page lists all the utility functions used by the tokenizers, mainly the class
-:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that implements the common methods between
-:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` and the mixin
-:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.
-
-Most of those are only useful if you are studying the code of the tokenizers in the library.
-
-PreTrainedTokenizerBase
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.tokenization_utils_base.PreTrainedTokenizerBase
-    :special-members: __call__
-    :members:
-
-
-SpecialTokensMixin
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.tokenization_utils_base.SpecialTokensMixin
-    :members:
-
-
-Enums and namedtuples
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy
-
-.. autoclass:: transformers.tokenization_utils_base.CharSpan
-
-.. autoclass:: transformers.tokenization_utils_base.TokenSpan
diff --git a/docs/source/internal/trainer_utils.mdx b/docs/source/internal/trainer_utils.mdx
new file mode 100644
index 0000000000..054bd69b44
--- /dev/null
+++ b/docs/source/internal/trainer_utils.mdx
@@ -0,0 +1,43 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Utilities for Trainer
+
+This page lists all the utility functions used by [`Trainer`].
+
+Most of those are only useful if you are studying the code of the Trainer in the library.
+
+## Utilities
+
+[[autodoc]] EvalPrediction
+
+[[autodoc]] IntervalStrategy
+
+[[autodoc]] set_seed
+
+[[autodoc]] torch_distributed_zero_first
+
+## Callbacks internals
+
+[[autodoc]] trainer_callback.CallbackHandler
+
+## Distributed Evaluation
+
+[[autodoc]] trainer_pt_utils.DistributedTensorGatherer
+
+## Distributed Evaluation
+
+[[autodoc]] HfArgumentParser
+
+## Debug Utilities
+
+[[autodoc]] debug_utils.DebugUnderflowOverflow
diff --git a/docs/source/internal/trainer_utils.rst b/docs/source/internal/trainer_utils.rst
deleted file mode 100644
index 65720d15ba..0000000000
--- a/docs/source/internal/trainer_utils.rst
+++ /dev/null
@@ -1,54 +0,0 @@
-..
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Utilities for Trainer
------------------------------------------------------------------------------------------------------------------------
-
-This page lists all the utility functions used by :class:`~transformers.Trainer`.
-
-Most of those are only useful if you are studying the code of the Trainer in the library.
-
-Utilities
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.EvalPrediction
-
-.. autoclass:: transformers.IntervalStrategy
-
-.. autofunction:: transformers.set_seed
-
-.. autofunction:: transformers.torch_distributed_zero_first
-
-
-Callbacks internals
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.trainer_callback.CallbackHandler
-
-
-Distributed Evaluation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.trainer_pt_utils.DistributedTensorGatherer
-    :members:
-
-
-Distributed Evaluation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.HfArgumentParser
-
-
-Debug Utilities
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.debug_utils.DebugUnderflowOverflow
diff --git a/docs/source/main_classes/callback.mdx b/docs/source/main_classes/callback.mdx
new file mode 100644
index 0000000000..9847be9357
--- /dev/null
+++ b/docs/source/main_classes/callback.mdx
@@ -0,0 +1,106 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Callbacks
+
+Callbacks are objects that can customize the behavior of the training loop in the PyTorch
+[`Trainer`] (this feature is not yet implemented in TensorFlow) that can inspect the training loop
+state (for progress reporting, logging on TensorBoard or other ML platforms...) and take decisions (like early
+stopping).
+
+Callbacks are "read only" pieces of code, apart from the [`TrainerControl`] object they return, they
+cannot change anything in the training loop. For customizations that require changes in the training loop, you should
+subclass [`Trainer`] and override the methods you need (see [trainer](trainer) for examples).
+
+By default a [`Trainer`] will use the following callbacks:
+
+- [`DefaultFlowCallback`] which handles the default behavior for logging, saving and evaluation.
+- [`PrinterCallback`] or [`ProgressCallback`] to display progress and print the
+  logs (the first one is used if you deactivate tqdm through the [`TrainingArguments`], otherwise
+  it's the second one).
+- [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
+  or tensorboardX).
+- [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
+- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.ml/site/) is installed.
+- [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
+- [`~integrations.AzureMLCallback`] if [azureml-sdk](https://pypi.org/project/azureml-sdk/) is
+  installed.
+
+The main class that implements callbacks is [`TrainerCallback`]. It gets the
+[`TrainingArguments`] used to instantiate the [`Trainer`], can access that
+Trainer's internal state via [`TrainerState`], and can take some actions on the training loop via
+[`TrainerControl`].
+
+
+## Available Callbacks
+
+Here is the list of the available [`TrainerCallback`] in the library:
+
+[[autodoc]] integrations.CometCallback
+    - setup
+
+[[autodoc]] DefaultFlowCallback
+
+[[autodoc]] PrinterCallback
+
+[[autodoc]] ProgressCallback
+
+[[autodoc]] EarlyStoppingCallback
+
+[[autodoc]] integrations.TensorBoardCallback
+
+[[autodoc]] integrations.WandbCallback
+    - setup
+
+[[autodoc]] integrations.MLflowCallback
+    - setup
+
+[[autodoc]] integrations.AzureMLCallback
+
+## TrainerCallback
+
+[[autodoc]] TrainerCallback
+
+Here is an example of how to register a custom callback with the PyTorch [`Trainer`]:
+
+```python
+class MyCallback(TrainerCallback):
+    "A callback that prints a message at the beginning of training"
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        print("Starting training")
+
+trainer = Trainer(
+    model,
+    args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    callbacks=[MyCallback]  # We can either pass the callback class this way or an instance of it (MyCallback())
+)
+```
+
+Another way to register a callback is to call `trainer.add_callback()` as follows:
+
+```python
+trainer = Trainer(...)
+trainer.add_callback(MyCallback)
+# Alternatively, we can pass an instance of the callback class
+trainer.add_callback(MyCallback())
+```
+
+## TrainerState
+
+[[autodoc]] TrainerState
+
+## TrainerControl
+
+[[autodoc]] TrainerControl
diff --git a/docs/source/main_classes/callback.rst b/docs/source/main_classes/callback.rst
deleted file mode 100644
index 3a7934bdce..0000000000
--- a/docs/source/main_classes/callback.rst
+++ /dev/null
@@ -1,115 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Callbacks
------------------------------------------------------------------------------------------------------------------------
-
-Callbacks are objects that can customize the behavior of the training loop in the PyTorch
-:class:`~transformers.Trainer` (this feature is not yet implemented in TensorFlow) that can inspect the training loop
-state (for progress reporting, logging on TensorBoard or other ML platforms...) and take decisions (like early
-stopping).
-
-Callbacks are "read only" pieces of code, apart from the :class:`~transformers.TrainerControl` object they return, they
-cannot change anything in the training loop. For customizations that require changes in the training loop, you should
-subclass :class:`~transformers.Trainer` and override the methods you need (see :doc:`trainer` for examples).
-
-By default a :class:`~transformers.Trainer` will use the following callbacks:
-
-- :class:`~transformers.DefaultFlowCallback` which handles the default behavior for logging, saving and evaluation.
-- :class:`~transformers.PrinterCallback` or :class:`~transformers.ProgressCallback` to display progress and print the
-  logs (the first one is used if you deactivate tqdm through the :class:`~transformers.TrainingArguments`, otherwise
-  it's the second one).
-- :class:`~transformers.integrations.TensorBoardCallback` if tensorboard is accessible (either through PyTorch >= 1.4
-  or tensorboardX).
-- :class:`~transformers.integrations.WandbCallback` if `wandb <https://www.wandb.com/>`__ is installed.
-- :class:`~transformers.integrations.CometCallback` if `comet_ml <https://www.comet.ml/site/>`__ is installed.
-- :class:`~transformers.integrations.MLflowCallback` if `mlflow <https://www.mlflow.org/>`__ is installed.
-- :class:`~transformers.integrations.AzureMLCallback` if `azureml-sdk <https://pypi.org/project/azureml-sdk/>`__ is
-  installed.
-
-The main class that implements callbacks is :class:`~transformers.TrainerCallback`. It gets the
-:class:`~transformers.TrainingArguments` used to instantiate the :class:`~transformers.Trainer`, can access that
-Trainer's internal state via :class:`~transformers.TrainerState`, and can take some actions on the training loop via
-:class:`~transformers.TrainerControl`.
-
-
-Available Callbacks
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Here is the list of the available :class:`~transformers.TrainerCallback` in the library:
-
-.. autoclass:: transformers.integrations.CometCallback
-    :members: setup
-
-.. autoclass:: transformers.DefaultFlowCallback
-
-.. autoclass:: transformers.PrinterCallback
-
-.. autoclass:: transformers.ProgressCallback
-
-.. autoclass:: transformers.EarlyStoppingCallback
-
-.. autoclass:: transformers.integrations.TensorBoardCallback
-
-.. autoclass:: transformers.integrations.WandbCallback
-    :members: setup
-
-.. autoclass:: transformers.integrations.MLflowCallback
-    :members: setup
-
-.. autoclass:: transformers.integrations.AzureMLCallback
-
-TrainerCallback
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TrainerCallback
-    :members:
-
-Here is an example of how to register a custom callback with the PyTorch :class:`~transformers.Trainer`:
-
-.. code-block:: python
-
-    class MyCallback(TrainerCallback):
-        "A callback that prints a message at the beginning of training"
-
-        def on_train_begin(self, args, state, control, **kwargs):
-            print("Starting training")
-
-    trainer = Trainer(
-        model,
-        args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        callbacks=[MyCallback]  # We can either pass the callback class this way or an instance of it (MyCallback())
-    )
-
-Another way to register a callback is to call ``trainer.add_callback()`` as follows:
-
-.. code-block:: python
-
-    trainer = Trainer(...)
-    trainer.add_callback(MyCallback)
-    # Alternatively, we can pass an instance of the callback class
-    trainer.add_callback(MyCallback())
-
-TrainerState
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TrainerState
-    :members:
-
-
-TrainerControl
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TrainerControl
-    :members:
diff --git a/docs/source/main_classes/configuration.mdx b/docs/source/main_classes/configuration.mdx
new file mode 100644
index 0000000000..541781eff7
--- /dev/null
+++ b/docs/source/main_classes/configuration.mdx
@@ -0,0 +1,28 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Configuration
+
+The base class [`PretrainedConfig`] implements the common methods for loading/saving a configuration
+either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
+from HuggingFace's AWS S3 repository).
+
+Each derived config class implements model specific attributes. Common attributes present in all config classes are:
+`hidden_size`, `num_attention_heads`, and `num_hidden_layers`. Text models further implement:
+`vocab_size`.
+
+
+## PretrainedConfig
+
+[[autodoc]] PretrainedConfig
+    - push_to_hub
+    - all
diff --git a/docs/source/main_classes/configuration.rst b/docs/source/main_classes/configuration.rst
deleted file mode 100644
index bcd665a19a..0000000000
--- a/docs/source/main_classes/configuration.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Configuration
------------------------------------------------------------------------------------------------------------------------
-
-The base class :class:`~transformers.PretrainedConfig` implements the common methods for loading/saving a configuration
-either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
-from HuggingFace's AWS S3 repository).
-
-Each derived config class implements model specific attributes. Common attributes present in all config classes are:
-:obj:`hidden_size`, :obj:`num_attention_heads`, and :obj:`num_hidden_layers`. Text models further implement:
-:obj:`vocab_size`.
-
-
-
-PretrainedConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.PretrainedConfig
-    :special-members: push_to_hub
-    :members:
diff --git a/docs/source/main_classes/data_collator.mdx b/docs/source/main_classes/data_collator.mdx
new file mode 100644
index 0000000000..ee1c1418e4
--- /dev/null
+++ b/docs/source/main_classes/data_collator.mdx
@@ -0,0 +1,64 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Data Collator
+
+Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of
+the same type as the elements of `train_dataset` or `eval_dataset`.
+
+To be able to build batches, data collators may apply some processing (like padding). Some of them (like
+[`DataCollatorForLanguageModeling`]) also apply some random data augmentation (like random masking)
+on the formed batch.
+
+Examples of use can be found in the [example scripts](../examples) or [example notebooks](../notebooks).
+
+
+## Default data collator
+
+[[autodoc]] data.data_collator.default_data_collator
+
+## DefaultDataCollator
+
+[[autodoc]] data.data_collator.DefaultDataCollator
+
+## DataCollatorWithPadding
+
+[[autodoc]] data.data_collator.DataCollatorWithPadding
+
+## DataCollatorForTokenClassification
+
+[[autodoc]] data.data_collator.DataCollatorForTokenClassification
+
+## DataCollatorForSeq2Seq
+
+[[autodoc]] data.data_collator.DataCollatorForSeq2Seq
+
+## DataCollatorForLanguageModeling
+
+[[autodoc]] data.data_collator.DataCollatorForLanguageModeling
+    - numpy_mask_tokens
+    - tf_mask_tokens
+    - torch_mask_tokens
+
+## DataCollatorForWholeWordMask
+
+[[autodoc]] data.data_collator.DataCollatorForWholeWordMask
+    - numpy_mask_tokens
+    - tf_mask_tokens
+    - torch_mask_tokens
+
+## DataCollatorForPermutationLanguageModeling
+
+[[autodoc]] data.data_collator.DataCollatorForPermutationLanguageModeling
+    - numpy_mask_tokens
+    - tf_mask_tokens
+    - torch_mask_tokens
diff --git a/docs/source/main_classes/data_collator.rst b/docs/source/main_classes/data_collator.rst
deleted file mode 100644
index 4893ebf252..0000000000
--- a/docs/source/main_classes/data_collator.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Data Collator
------------------------------------------------------------------------------------------------------------------------
-
-Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of
-the same type as the elements of :obj:`train_dataset` or :obj:`eval_dataset`.
-
-To be able to build batches, data collators may apply some processing (like padding). Some of them (like
-:class:`~transformers.DataCollatorForLanguageModeling`) also apply some random data augmentation (like random masking)
-on the formed batch.
-
-Examples of use can be found in the :doc:`example scripts <../examples>` or :doc:`example notebooks <../notebooks>`.
-
-
-Default data collator
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: transformers.data.data_collator.default_data_collator
-
-
-DefaultDataCollator
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DefaultDataCollator
-    :members:
-
-
-DataCollatorWithPadding
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DataCollatorWithPadding
-    :members:
-
-
-DataCollatorForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DataCollatorForTokenClassification
-    :members:
-
-
-DataCollatorForSeq2Seq
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DataCollatorForSeq2Seq
-    :members:
-
-
-DataCollatorForLanguageModeling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DataCollatorForLanguageModeling
-    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens
-
-
-DataCollatorForWholeWordMask
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DataCollatorForWholeWordMask
-    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens
-
-
-DataCollatorForPermutationLanguageModeling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DataCollatorForPermutationLanguageModeling
-    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens
diff --git a/docs/source/main_classes/feature_extractor.mdx b/docs/source/main_classes/feature_extractor.mdx
new file mode 100644
index 0000000000..959c4a001f
--- /dev/null
+++ b/docs/source/main_classes/feature_extractor.mdx
@@ -0,0 +1,38 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Feature Extractor
+
+A feature extractor is in charge of preparing input features for a multi-modal model. This includes feature extraction
+from sequences, *e.g.*, pre-processing audio files to Log-Mel Spectrogram features, feature extraction from images
+*e.g.* cropping image image files, but also padding, normalization, and conversion to Numpy, PyTorch, and TensorFlow
+tensors.
+
+
+## FeatureExtractionMixin
+
+[[autodoc]] feature_extraction_utils.FeatureExtractionMixin
+    - from_pretrained
+    - save_pretrained
+
+## SequenceFeatureExtractor
+
+[[autodoc]] SequenceFeatureExtractor
+    - pad
+
+## BatchFeature
+
+[[autodoc]] BatchFeature
+
+## ImageFeatureExtractionMixin
+
+[[autodoc]] image_utils.ImageFeatureExtractionMixin
diff --git a/docs/source/main_classes/feature_extractor.rst b/docs/source/main_classes/feature_extractor.rst
deleted file mode 100644
index a4577bbccf..0000000000
--- a/docs/source/main_classes/feature_extractor.rst
+++ /dev/null
@@ -1,48 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-
-Feature Extractor
------------------------------------------------------------------------------------------------------------------------
-
-A feature extractor is in charge of preparing input features for a multi-modal model. This includes feature extraction
-from sequences, *e.g.*, pre-processing audio files to Log-Mel Spectrogram features, feature extraction from images
-*e.g.* cropping image image files, but also padding, normalization, and conversion to Numpy, PyTorch, and TensorFlow
-tensors.
-
-
-FeatureExtractionMixin
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.feature_extraction_utils.FeatureExtractionMixin
-    :members: from_pretrained, save_pretrained
-
-
-SequenceFeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SequenceFeatureExtractor
-    :members: pad
-
-
-BatchFeature
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BatchFeature
-    :members:
-
-
-ImageFeatureExtractionMixin
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.image_utils.ImageFeatureExtractionMixin
-    :members:
diff --git a/docs/source/main_classes/keras_callbacks.mdx b/docs/source/main_classes/keras_callbacks.mdx
new file mode 100644
index 0000000000..cffb6a601b
--- /dev/null
+++ b/docs/source/main_classes/keras_callbacks.mdx
@@ -0,0 +1,20 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Keras callbacks
+
+When training a Transformers model with Keras, there are some library-specific callbacks available to automate common
+tasks:
+
+## PushToHubCallback
+
+[[autodoc]] keras_callbacks.PushToHubCallback
diff --git a/docs/source/main_classes/keras_callbacks.rst b/docs/source/main_classes/keras_callbacks.rst
deleted file mode 100644
index 476802469b..0000000000
--- a/docs/source/main_classes/keras_callbacks.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-..
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Keras callbacks
-=======================================================================================================================
-
-When training a Transformers model with Keras, there are some library-specific callbacks available to automate common
-tasks:
-
-PushToHubCallback
------------------------------------------------------------------------------------------------------------------------
-
-.. autoclass:: transformers.keras_callbacks.PushToHubCallback
diff --git a/docs/source/main_classes/logging.mdx b/docs/source/main_classes/logging.mdx
new file mode 100644
index 0000000000..467fe6d189
--- /dev/null
+++ b/docs/source/main_classes/logging.mdx
@@ -0,0 +1,80 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Logging
+
+🤗 Transformers has a centralized logging system, so that you can setup the verbosity of the library easily.
+
+Currently the default verbosity of the library is `WARNING`.
+
+To change the level of verbosity, just use one of the direct setters. For instance, here is how to change the verbosity
+to the INFO level.
+
+```python
+import transformers
+transformers.logging.set_verbosity_info()
+```
+
+You can also use the environment variable `TRANSFORMERS_VERBOSITY` to override the default verbosity. You can set it
+to one of the following: `debug`, `info`, `warning`, `error`, `critical`. For example:
+
+```bash
+TRANSFORMERS_VERBOSITY=error ./myprogram.py
+```
+
+Additionally, some `warnings` can be disabled by setting the environment variable
+`TRANSFORMERS_NO_ADVISORY_WARNINGS` to a true value, like *1*. This will disable any warning that is logged using
+[`logger.warning_advice`]. For example:
+
+
+```bash
+TRANSFORMERS_NO_ADVISORY_WARNINGS=1 ./myprogram.py
+```
+
+All the methods of this logging module are documented below, the main ones are
+[`logging.get_verbosity`] to get the current level of verbosity in the logger and
+[`logging.set_verbosity`] to set the verbosity to the level of your choice. In order (from the least
+verbose to the most verbose), those levels (with their corresponding int values in parenthesis) are:
+
+- `transformers.logging.CRITICAL` or `transformers.logging.FATAL` (int value, 50): only report the most
+  critical errors.
+- `transformers.logging.ERROR` (int value, 40): only report errors.
+- `transformers.logging.WARNING` or `transformers.logging.WARN` (int value, 30): only reports error and
+  warnings. This the default level used by the library.
+- `transformers.logging.INFO` (int value, 20): reports error, warnings and basic information.
+- `transformers.logging.DEBUG` (int value, 10): report all information.
+
+## Base setters
+
+[[autodoc]] logging.set_verbosity_error
+
+[[autodoc]] logging.set_verbosity_warning
+
+[[autodoc]] logging.set_verbosity_info
+
+[[autodoc]] logging.set_verbosity_debug
+
+## Other functions
+
+[[autodoc]] logging.get_verbosity
+
+[[autodoc]] logging.set_verbosity
+
+[[autodoc]] logging.get_logger
+
+[[autodoc]] logging.enable_default_handler
+
+[[autodoc]] logging.disable_default_handler
+
+[[autodoc]] logging.enable_explicit_format
+
+[[autodoc]] logging.reset_format
diff --git a/docs/source/main_classes/logging.rst b/docs/source/main_classes/logging.rst
deleted file mode 100644
index 9377bb0613..0000000000
--- a/docs/source/main_classes/logging.rst
+++ /dev/null
@@ -1,83 +0,0 @@
-..
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Logging
------------------------------------------------------------------------------------------------------------------------
-
-🤗 Transformers has a centralized logging system, so that you can setup the verbosity of the library easily.
-
-Currently the default verbosity of the library is ``WARNING``.
-
-To change the level of verbosity, just use one of the direct setters. For instance, here is how to change the verbosity
-to the INFO level.
-
-.. code-block:: python
-
-    import transformers
-    transformers.logging.set_verbosity_info()
-
-You can also use the environment variable ``TRANSFORMERS_VERBOSITY`` to override the default verbosity. You can set it
-to one of the following: ``debug``, ``info``, ``warning``, ``error``, ``critical``. For example:
-
-.. code-block:: bash
-
-    TRANSFORMERS_VERBOSITY=error ./myprogram.py
-
-Additionally, some ``warnings`` can be disabled by setting the environment variable
-``TRANSFORMERS_NO_ADVISORY_WARNINGS`` to a true value, like `1`. This will disable any warning that is logged using
-:meth:`logger.warning_advice`. For example:
-
-
-.. code-block:: bash
-
-    TRANSFORMERS_NO_ADVISORY_WARNINGS=1 ./myprogram.py
-
-All the methods of this logging module are documented below, the main ones are
-:func:`transformers.logging.get_verbosity` to get the current level of verbosity in the logger and
-:func:`transformers.logging.set_verbosity` to set the verbosity to the level of your choice. In order (from the least
-verbose to the most verbose), those levels (with their corresponding int values in parenthesis) are:
-
-- :obj:`transformers.logging.CRITICAL` or :obj:`transformers.logging.FATAL` (int value, 50): only report the most
-  critical errors.
-- :obj:`transformers.logging.ERROR` (int value, 40): only report errors.
-- :obj:`transformers.logging.WARNING` or :obj:`transformers.logging.WARN` (int value, 30): only reports error and
-  warnings. This the default level used by the library.
-- :obj:`transformers.logging.INFO` (int value, 20): reports error, warnings and basic information.
-- :obj:`transformers.logging.DEBUG` (int value, 10): report all information.
-
-Base setters
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: transformers.logging.set_verbosity_error
-
-.. autofunction:: transformers.logging.set_verbosity_warning
-
-.. autofunction:: transformers.logging.set_verbosity_info
-
-.. autofunction:: transformers.logging.set_verbosity_debug
-
-Other functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: transformers.logging.get_verbosity
-
-.. autofunction:: transformers.logging.set_verbosity
-
-.. autofunction:: transformers.logging.get_logger
-
-.. autofunction:: transformers.logging.enable_default_handler
-
-.. autofunction:: transformers.logging.disable_default_handler
-
-.. autofunction:: transformers.logging.enable_explicit_format
-
-.. autofunction:: transformers.logging.reset_format
diff --git a/docs/source/main_classes/model.mdx b/docs/source/main_classes/model.mdx
new file mode 100644
index 0000000000..d65ae8516e
--- /dev/null
+++ b/docs/source/main_classes/model.mdx
@@ -0,0 +1,99 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Models
+
+The base classes [`PreTrainedModel`], [`TFPreTrainedModel`], and
+[`FlaxPreTrainedModel`] implement the common methods for loading/saving a model either from a local
+file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS
+S3 repository).
+
+[`PreTrainedModel`] and [`TFPreTrainedModel`] also implement a few methods which
+are common among all the models to:
+
+- resize the input token embeddings when new tokens are added to the vocabulary
+- prune the attention heads of the model.
+
+The other methods that are common to each model are defined in [`~modeling_utils.ModuleUtilsMixin`]
+(for the PyTorch models) and [`~modeling_tf_utils.TFModuleUtilsMixin`] (for the TensorFlow models) or
+for text generation, [`~generation_utils.GenerationMixin`] (for the PyTorch models),
+[`~generation_tf_utils.TFGenerationMixin`] (for the TensorFlow models) and
+[`~generation_flax_utils.FlaxGenerationMixin`] (for the Flax/JAX models).
+
+
+## PreTrainedModel
+
+[[autodoc]] PreTrainedModel
+    - push_to_hub
+    - all
+
+<a id='from_pretrained-torch-dtype'></a>
+
+### Model Instantiation dtype
+
+Under Pytorch a model normally gets instantiated with `torch.float32` format. This can be an issue if one tries to
+load a model whose weights are in fp16, since it'd require twice as much memory. To overcome this limitation, you can
+either explicitly pass the desired `dtype` using `torch_dtype` argument:
+
+```python
+model = T5ForConditionalGeneration.from_pretrained("t5", torch_dtype=torch.float16)
+```
+
+or, if you want the model to always load in the most optimal memory pattern, you can use the special value `"auto"`,
+and then `dtype` will be automatically derived from the model's weights:
+
+```python
+model = T5ForConditionalGeneration.from_pretrained("t5", torch_dtype="auto")
+```
+
+Models instantiated from scratch can also be told which `dtype` to use with:
+
+```python
+config = T5Config.from_pretrained("t5")
+model = AutoModel.from_config(config)
+```
+
+Due to Pytorch design, this functionality is only available for floating dtypes.
+
+
+
+## ModuleUtilsMixin
+
+[[autodoc]] modeling_utils.ModuleUtilsMixin
+
+## TFPreTrainedModel
+
+[[autodoc]] TFPreTrainedModel
+    - push_to_hub
+    - all
+
+## TFModelUtilsMixin
+
+[[autodoc]] modeling_tf_utils.TFModelUtilsMixin
+
+## FlaxPreTrainedModel
+
+[[autodoc]] FlaxPreTrainedModel
+    - push_to_hub
+    - all
+
+## Generation
+
+[[autodoc]] generation_utils.GenerationMixin
+
+[[autodoc]] generation_tf_utils.TFGenerationMixin
+
+[[autodoc]] generation_flax_utils.FlaxGenerationMixin
+
+## Pushing to the Hub
+
+[[autodoc]] file_utils.PushToHubMixin
diff --git a/docs/source/main_classes/model.rst b/docs/source/main_classes/model.rst
deleted file mode 100644
index 4633315a28..0000000000
--- a/docs/source/main_classes/model.rst
+++ /dev/null
@@ -1,120 +0,0 @@
-..
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Models
------------------------------------------------------------------------------------------------------------------------
-
-The base classes :class:`~transformers.PreTrainedModel`, :class:`~transformers.TFPreTrainedModel`, and
-:class:`~transformers.FlaxPreTrainedModel` implement the common methods for loading/saving a model either from a local
-file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS
-S3 repository).
-
-:class:`~transformers.PreTrainedModel` and :class:`~transformers.TFPreTrainedModel` also implement a few methods which
-are common among all the models to:
-
-- resize the input token embeddings when new tokens are added to the vocabulary
-- prune the attention heads of the model.
-
-The other methods that are common to each model are defined in :class:`~transformers.modeling_utils.ModuleUtilsMixin`
-(for the PyTorch models) and :class:`~transformers.modeling_tf_utils.TFModuleUtilsMixin` (for the TensorFlow models) or
-for text generation, :class:`~transformers.generation_utils.GenerationMixin` (for the PyTorch models),
-:class:`~transformers.generation_tf_utils.TFGenerationMixin` (for the TensorFlow models) and
-:class:`~transformers.generation_flax_utils.FlaxGenerationMixin` (for the Flax/JAX models).
-
-
-PreTrainedModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.PreTrainedModel
-    :special-members: push_to_hub
-    :members:
-
-
-.. _from_pretrained-torch-dtype:
-
-Model Instantiation dtype
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Under Pytorch a model normally gets instantiated with ``torch.float32`` format. This can be an issue if one tries to
-load a model whose weights are in fp16, since it'd require twice as much memory. To overcome this limitation, you can
-either explicitly pass the desired ``dtype`` using ``torch_dtype`` argument:
-
-.. code-block:: python
-
-    model = T5ForConditionalGeneration.from_pretrained("t5", torch_dtype=torch.float16)
-
-or, if you want the model to always load in the most optimal memory pattern, you can use the special value ``"auto"``,
-and then ``dtype`` will be automatically derived from the model's weights:
-
-.. code-block:: python
-
-    model = T5ForConditionalGeneration.from_pretrained("t5", torch_dtype="auto")
-
-Models instantiated from scratch can also be told which ``dtype`` to use with:
-
-.. code-block:: python
-
-    config = T5Config.from_pretrained("t5")
-    model = AutoModel.from_config(config)
-
-Due to Pytorch design, this functionality is only available for floating dtypes.
-
-
-
-ModuleUtilsMixin
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_utils.ModuleUtilsMixin
-    :members:
-
-
-TFPreTrainedModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFPreTrainedModel
-    :special-members: push_to_hub
-    :members:
-
-
-TFModelUtilsMixin
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_utils.TFModelUtilsMixin
-    :members:
-
-
-FlaxPreTrainedModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxPreTrainedModel
-    :special-members: push_to_hub
-    :members:
-
-
-Generation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.generation_utils.GenerationMixin
-    :members:
-
-.. autoclass:: transformers.generation_tf_utils.TFGenerationMixin
-    :members:
-
-.. autoclass:: transformers.generation_flax_utils.FlaxGenerationMixin
-    :members:
-
-
-Pushing to the Hub
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.file_utils.PushToHubMixin
-    :members:
diff --git a/docs/source/main_classes/optimizer_schedules.mdx b/docs/source/main_classes/optimizer_schedules.mdx
new file mode 100644
index 0000000000..c842c8d1aa
--- /dev/null
+++ b/docs/source/main_classes/optimizer_schedules.mdx
@@ -0,0 +1,71 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Optimization
+
+The `.optimization` module provides:
+
+- an optimizer with weight decay fixed that can be used to fine-tuned models, and
+- several schedules in the form of schedule objects that inherit from `_LRSchedule`:
+- a gradient accumulation class to accumulate the gradients of multiple batches
+
+## AdamW (PyTorch)
+
+[[autodoc]] AdamW
+
+## AdaFactor (PyTorch)
+
+[[autodoc]] Adafactor
+
+## AdamWeightDecay (TensorFlow)
+
+[[autodoc]] AdamWeightDecay
+
+[[autodoc]] create_optimizer
+
+## Schedules
+
+### Learning Rate Schedules (Pytorch)
+
+[[autodoc]] SchedulerType
+
+[[autodoc]] get_scheduler
+
+[[autodoc]] get_constant_schedule
+
+[[autodoc]] get_constant_schedule_with_warmup
+
+<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_constant_schedule.png"/>
+
+[[autodoc]] get_cosine_schedule_with_warmup
+
+<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_cosine_schedule.png"/>
+
+[[autodoc]] get_cosine_with_hard_restarts_schedule_with_warmup
+
+<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_cosine_hard_restarts_schedule.png"/>
+
+[[autodoc]] get_linear_schedule_with_warmup
+
+<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_linear_schedule.png"/>
+
+[[autodoc]] get_polynomial_decay_schedule_with_warmup
+
+### Warmup (TensorFlow)
+
+[[autodoc]] WarmUp
+
+## Gradient Strategies
+
+### GradientAccumulator (TensorFlow)
+
+[[autodoc]] GradientAccumulator
diff --git a/docs/source/main_classes/optimizer_schedules.rst b/docs/source/main_classes/optimizer_schedules.rst
deleted file mode 100644
index 6ac96a8546..0000000000
--- a/docs/source/main_classes/optimizer_schedules.rst
+++ /dev/null
@@ -1,97 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Optimization
------------------------------------------------------------------------------------------------------------------------
-
-The ``.optimization`` module provides:
-
-- an optimizer with weight decay fixed that can be used to fine-tuned models, and
-- several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
-- a gradient accumulation class to accumulate the gradients of multiple batches
-
-AdamW (PyTorch)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AdamW
-    :members:
-
-AdaFactor (PyTorch)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Adafactor
-
-AdamWeightDecay (TensorFlow)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AdamWeightDecay
-
-.. autofunction:: transformers.create_optimizer
-
-Schedules
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Learning Rate Schedules (Pytorch)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. autoclass:: transformers.SchedulerType
-
-.. autofunction:: transformers.get_scheduler
-
-.. autofunction:: transformers.get_constant_schedule
-
-
-.. autofunction:: transformers.get_constant_schedule_with_warmup
-
-.. image:: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_constant_schedule.png
-    :target: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_constant_schedule.png
-    :alt:
-
-
-.. autofunction:: transformers.get_cosine_schedule_with_warmup
-
-.. image:: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_cosine_schedule.png
-    :target: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_cosine_schedule.png
-    :alt:
-
-
-.. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup
-
-.. image:: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_cosine_hard_restarts_schedule.png
-    :target: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_cosine_hard_restarts_schedule.png
-    :alt:
-
-
-
-.. autofunction:: transformers.get_linear_schedule_with_warmup
-
-.. image:: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_linear_schedule.png
-    :target: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_linear_schedule.png
-    :alt:
-
-
-.. autofunction:: transformers.get_polynomial_decay_schedule_with_warmup
-
-
-Warmup (TensorFlow)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. autoclass:: transformers.WarmUp
-    :members:
-
-Gradient Strategies
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-GradientAccumulator (TensorFlow)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. autoclass:: transformers.GradientAccumulator
diff --git a/docs/source/main_classes/output.mdx b/docs/source/main_classes/output.mdx
new file mode 100644
index 0000000000..5d406ac5d5
--- /dev/null
+++ b/docs/source/main_classes/output.mdx
@@ -0,0 +1,269 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Model outputs
+
+All models have outputs that are instances of subclasses of [`~file_utils.ModelOutput`]. Those are
+data structures containing all the information returned by the model, but that can also be used as tuples or
+dictionaries.
+
+Let's see of this looks on an example:
+
+```python
+from transformers import BertTokenizer, BertForSequenceClassification
+import torch
+
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+outputs = model(**inputs, labels=labels)
+```
+
+The `outputs` object is a [`~modeling_outputs.SequenceClassifierOutput`], as we can see in the
+documentation of that class below, it means it has an optional `loss`, a `logits` an optional `hidden_states` and
+an optional `attentions` attribute. Here we have the `loss` since we passed along `labels`, but we don't have
+`hidden_states` and `attentions` because we didn't pass `output_hidden_states=True` or
+`output_attentions=True`.
+
+You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
+will get `None`. Here for instance `outputs.loss` is the loss computed by the model, and `outputs.attentions` is
+`None`.
+
+When considering our `outputs` object as tuple, it only considers the attributes that don't have `None` values.
+Here for instance, it has two elements, `loss` then `logits`, so
+
+```python
+outputs[:2]
+```
+
+will return the tuple `(outputs.loss, outputs.logits)` for instance.
+
+When considering our `outputs` object as dictionary, it only considers the attributes that don't have `None`
+values. Here for instance, it has two keys that are `loss` and `logits`.
+
+We document here the generic model outputs that are used by more than one model type. Specific output types are
+documented on their corresponding model page.
+
+## ModelOutput
+
+[[autodoc]] file_utils.ModelOutput
+    - to_tuple
+
+## BaseModelOutput
+
+[[autodoc]] modeling_outputs.BaseModelOutput
+
+## BaseModelOutputWithPooling
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithPooling
+
+## BaseModelOutputWithCrossAttentions
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithCrossAttentions
+
+## BaseModelOutputWithPoolingAndCrossAttentions
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions
+
+## BaseModelOutputWithPast
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithPast
+
+## BaseModelOutputWithPastAndCrossAttentions
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithPastAndCrossAttentions
+
+## Seq2SeqModelOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqModelOutput
+
+## CausalLMOutput
+
+[[autodoc]] modeling_outputs.CausalLMOutput
+
+## CausalLMOutputWithCrossAttentions
+
+[[autodoc]] modeling_outputs.CausalLMOutputWithCrossAttentions
+
+## CausalLMOutputWithPast
+
+[[autodoc]] modeling_outputs.CausalLMOutputWithPast
+
+## MaskedLMOutput
+
+[[autodoc]] modeling_outputs.MaskedLMOutput
+
+## Seq2SeqLMOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqLMOutput
+
+## NextSentencePredictorOutput
+
+[[autodoc]] modeling_outputs.NextSentencePredictorOutput
+
+## SequenceClassifierOutput
+
+[[autodoc]] modeling_outputs.SequenceClassifierOutput
+
+## Seq2SeqSequenceClassifierOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqSequenceClassifierOutput
+
+## MultipleChoiceModelOutput
+
+[[autodoc]] modeling_outputs.MultipleChoiceModelOutput
+
+## TokenClassifierOutput
+
+[[autodoc]] modeling_outputs.TokenClassifierOutput
+
+## QuestionAnsweringModelOutput
+
+[[autodoc]] modeling_outputs.QuestionAnsweringModelOutput
+
+## Seq2SeqQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqQuestionAnsweringModelOutput
+
+## TFBaseModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutput
+
+## TFBaseModelOutputWithPooling
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPooling
+
+## TFBaseModelOutputWithPoolingAndCrossAttentions
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions
+
+## TFBaseModelOutputWithPast
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPast
+
+## TFBaseModelOutputWithPastAndCrossAttentions
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPastAndCrossAttentions
+
+## TFSeq2SeqModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFSeq2SeqModelOutput
+
+## TFCausalLMOutput
+
+[[autodoc]] modeling_tf_outputs.TFCausalLMOutput
+
+## TFCausalLMOutputWithCrossAttentions
+
+[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithCrossAttentions
+
+## TFCausalLMOutputWithPast
+
+[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithPast
+
+## TFMaskedLMOutput
+
+[[autodoc]] modeling_tf_outputs.TFMaskedLMOutput
+
+## TFSeq2SeqLMOutput
+
+[[autodoc]] modeling_tf_outputs.TFSeq2SeqLMOutput
+
+## TFNextSentencePredictorOutput
+
+[[autodoc]] modeling_tf_outputs.TFNextSentencePredictorOutput
+
+## TFSequenceClassifierOutput
+
+[[autodoc]] modeling_tf_outputs.TFSequenceClassifierOutput
+
+## TFSeq2SeqSequenceClassifierOutput
+
+[[autodoc]] modeling_tf_outputs.TFSeq2SeqSequenceClassifierOutput
+
+## TFMultipleChoiceModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFMultipleChoiceModelOutput
+
+## TFTokenClassifierOutput
+
+[[autodoc]] modeling_tf_outputs.TFTokenClassifierOutput
+
+## TFQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFQuestionAnsweringModelOutput
+
+## TFSeq2SeqQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput
+
+## FlaxBaseModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutput
+
+## FlaxBaseModelOutputWithPast
+
+[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPast
+
+## FlaxBaseModelOutputWithPooling
+
+[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPooling
+
+## FlaxBaseModelOutputWithPastAndCrossAttentions
+
+[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPastAndCrossAttentions
+
+## FlaxSeq2SeqModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqModelOutput
+
+## FlaxCausalLMOutputWithCrossAttentions
+
+[[autodoc]] modeling_flax_outputs.FlaxCausalLMOutputWithCrossAttentions
+
+## FlaxMaskedLMOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxMaskedLMOutput
+
+## FlaxSeq2SeqLMOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqLMOutput
+
+## FlaxNextSentencePredictorOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxNextSentencePredictorOutput
+
+## FlaxSequenceClassifierOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSequenceClassifierOutput
+
+## FlaxSeq2SeqSequenceClassifierOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqSequenceClassifierOutput
+
+## FlaxMultipleChoiceModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxMultipleChoiceModelOutput
+
+## FlaxTokenClassifierOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxTokenClassifierOutput
+
+## FlaxQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxQuestionAnsweringModelOutput
+
+## FlaxSeq2SeqQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqQuestionAnsweringModelOutput
diff --git a/docs/source/main_classes/output.rst b/docs/source/main_classes/output.rst
deleted file mode 100644
index be1ecca3cf..0000000000
--- a/docs/source/main_classes/output.rst
+++ /dev/null
@@ -1,412 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Model outputs
------------------------------------------------------------------------------------------------------------------------
-
-All models have outputs that are instances of subclasses of :class:`~transformers.file_utils.ModelOutput`. Those are
-data structures containing all the information returned by the model, but that can also be used as tuples or
-dictionaries.
-
-Let's see of this looks on an example:
-
-.. code-block::
-
-    from transformers import BertTokenizer, BertForSequenceClassification
-    import torch
-
-    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-    outputs = model(**inputs, labels=labels)
-
-The ``outputs`` object is a :class:`~transformers.modeling_outputs.SequenceClassifierOutput`, as we can see in the
-documentation of that class below, it means it has an optional ``loss``, a ``logits`` an optional ``hidden_states`` and
-an optional ``attentions`` attribute. Here we have the ``loss`` since we passed along ``labels``, but we don't have
-``hidden_states`` and ``attentions`` because we didn't pass ``output_hidden_states=True`` or
-``output_attentions=True``.
-
-You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
-will get ``None``. Here for instance ``outputs.loss`` is the loss computed by the model, and ``outputs.attentions`` is
-``None``.
-
-When considering our ``outputs`` object as tuple, it only considers the attributes that don't have ``None`` values.
-Here for instance, it has two elements, ``loss`` then ``logits``, so
-
-.. code-block::
-
-    outputs[:2]
-
-will return the tuple ``(outputs.loss, outputs.logits)`` for instance.
-
-When considering our ``outputs`` object as dictionary, it only considers the attributes that don't have ``None``
-values. Here for instance, it has two keys that are ``loss`` and ``logits``.
-
-We document here the generic model outputs that are used by more than one model type. Specific output types are
-documented on their corresponding model page.
-
-ModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.file_utils.ModelOutput
-    :members: to_tuple
-
-
-BaseModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.BaseModelOutput
-    :members:
-
-
-BaseModelOutputWithPooling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPooling
-    :members:
-
-
-BaseModelOutputWithCrossAttentions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithCrossAttentions
-    :members:
-
-
-BaseModelOutputWithPoolingAndCrossAttentions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions
-    :members:
-
-
-BaseModelOutputWithPast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPast
-    :members:
-
-
-BaseModelOutputWithPastAndCrossAttentions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions
-    :members:
-
-
-Seq2SeqModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.Seq2SeqModelOutput
-    :members:
-
-
-CausalLMOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.CausalLMOutput
-    :members:
-
-
-CausalLMOutputWithCrossAttentions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.CausalLMOutputWithCrossAttentions
-    :members:
-
-
-CausalLMOutputWithPast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.CausalLMOutputWithPast
-    :members:
-
-
-MaskedLMOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.MaskedLMOutput
-    :members:
-
-
-Seq2SeqLMOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.Seq2SeqLMOutput
-    :members:
-
-
-NextSentencePredictorOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.NextSentencePredictorOutput
-    :members:
-
-
-SequenceClassifierOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.SequenceClassifierOutput
-    :members:
-
-
-Seq2SeqSequenceClassifierOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.Seq2SeqSequenceClassifierOutput
-    :members:
-
-
-MultipleChoiceModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.MultipleChoiceModelOutput
-    :members:
-
-
-TokenClassifierOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.TokenClassifierOutput
-    :members:
-
-
-QuestionAnsweringModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.QuestionAnsweringModelOutput
-    :members:
-
-
-Seq2SeqQuestionAnsweringModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.Seq2SeqQuestionAnsweringModelOutput
-    :members:
-
-
-TFBaseModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutput
-    :members:
-
-
-TFBaseModelOutputWithPooling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutputWithPooling
-    :members:
-
-
-TFBaseModelOutputWithPoolingAndCrossAttentions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions
-    :members:
-
-
-TFBaseModelOutputWithPast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutputWithPast
-    :members:
-
-
-TFBaseModelOutputWithPastAndCrossAttentions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutputWithPastAndCrossAttentions
-    :members:
-
-
-TFSeq2SeqModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqModelOutput
-    :members:
-
-
-TFCausalLMOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFCausalLMOutput
-    :members:
-
-
-TFCausalLMOutputWithCrossAttentions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFCausalLMOutputWithCrossAttentions
-    :members:
-
-
-TFCausalLMOutputWithPast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFCausalLMOutputWithPast
-    :members:
-
-
-TFMaskedLMOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFMaskedLMOutput
-    :members:
-
-
-TFSeq2SeqLMOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqLMOutput
-    :members:
-
-
-TFNextSentencePredictorOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFNextSentencePredictorOutput
-    :members:
-
-
-TFSequenceClassifierOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFSequenceClassifierOutput
-    :members:
-
-
-TFSeq2SeqSequenceClassifierOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqSequenceClassifierOutput
-    :members:
-
-
-TFMultipleChoiceModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFMultipleChoiceModelOutput
-    :members:
-
-
-TFTokenClassifierOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFTokenClassifierOutput
-    :members:
-
-
-TFQuestionAnsweringModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFQuestionAnsweringModelOutput
-    :members:
-
-
-TFSeq2SeqQuestionAnsweringModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput
-    :members:
-
-
-FlaxBaseModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutput
-
-
-FlaxBaseModelOutputWithPast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPast
-
-
-FlaxBaseModelOutputWithPooling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPooling
-
-
-FlaxBaseModelOutputWithPastAndCrossAttentions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPastAndCrossAttentions
-
-
-FlaxSeq2SeqModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqModelOutput
-
-
-FlaxCausalLMOutputWithCrossAttentions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxCausalLMOutputWithCrossAttentions
-
-
-FlaxMaskedLMOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxMaskedLMOutput
-
-
-FlaxSeq2SeqLMOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqLMOutput
-
-
-FlaxNextSentencePredictorOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxNextSentencePredictorOutput
-
-
-FlaxSequenceClassifierOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxSequenceClassifierOutput
-
-
-FlaxSeq2SeqSequenceClassifierOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqSequenceClassifierOutput
-
-
-FlaxMultipleChoiceModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxMultipleChoiceModelOutput
-
-
-FlaxTokenClassifierOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxTokenClassifierOutput
-
-
-FlaxQuestionAnsweringModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxQuestionAnsweringModelOutput
-
-
-FlaxSeq2SeqQuestionAnsweringModelOutput
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqQuestionAnsweringModelOutput
diff --git a/docs/source/main_classes/pipelines.mdx b/docs/source/main_classes/pipelines.mdx
new file mode 100644
index 0000000000..a955cdef70
--- /dev/null
+++ b/docs/source/main_classes/pipelines.mdx
@@ -0,0 +1,375 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Pipelines
+
+The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most of
+the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity
+Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering. See the
+[task summary](../task_summary) for examples of use.
+
+There are two categories of pipeline abstractions to be aware about:
+
+- The [`pipeline`] which is the most powerful object encapsulating all other pipelines.
+- The other task-specific pipelines:
+
+  - [`AudioClassificationPipeline`]
+  - [`AutomaticSpeechRecognitionPipeline`]
+  - [`ConversationalPipeline`]
+  - [`FeatureExtractionPipeline`]
+  - [`FillMaskPipeline`]
+  - [`ImageClassificationPipeline`]
+  - [`ImageSegmentationPipeline`]
+  - [`ObjectDetectionPipeline`]
+  - [`QuestionAnsweringPipeline`]
+  - [`SummarizationPipeline`]
+  - [`TableQuestionAnsweringPipeline`]
+  - [`TextClassificationPipeline`]
+  - [`TextGenerationPipeline`]
+  - [`Text2TextGenerationPipeline`]
+  - [`TokenClassificationPipeline`]
+  - [`TranslationPipeline`]
+  - [`ZeroShotClassificationPipeline`]
+
+## The pipeline abstraction
+
+The *pipeline* abstraction is a wrapper around all the other available pipelines. It is instantiated as any other
+pipeline but can provide additional quality of life.
+
+Simple call on one item:
+
+```python
+>>> pipe = pipeline("text-classification")
+>>> pipe("This restaurant is awesome")
+[{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+```
+
+If you want to use a specific model from the [hub](https://huggingface.co) you can ignore the task if the model on
+the hub already defines it:
+
+```python
+>>> pipe = pipeline(model="roberta-large-mnli")
+>>> pipe("This restaurant is awesome")
+[{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+```
+
+To call a pipeline on many items, you can either call with a *list*.
+
+```python
+>>> pipe = pipeline("text-classification")
+>>> pipe(["This restaurant is awesome", "This restaurant is aweful"])
+[{'label': 'POSITIVE', 'score': 0.9998743534088135},
+ {'label': 'NEGATIVE', 'score': 0.9996669292449951}]
+```
+
+To iterate of full datasets it is recommended to use a `dataset` directly. This means you don't need to allocate
+the whole dataset at once, nor do you need to do batching yourself. This should work just as fast as custom loops on
+GPU. If it doesn't don't hesitate to create an issue.
+
+```python
+import datasets
+from transformers import pipeline
+from transformers.pipelines.base import KeyDataset
+import tqdm
+
+pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
+dataset = datasets.load_dataset("superb", name="asr", split="test")
+
+# KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
+# as we're not interested in the *target* part of the dataset.
+for out in tqdm.tqdm(pipe(KeyDataset(dataset, "file"))):
+    print(out)
+    # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
+    # {"text": ....}
+    # ....
+```
+
+[[autodoc]] pipeline
+
+## Pipeline batching
+
+All pipelines (except *zero-shot-classification* and *question-answering* currently) can use batching. This will work
+whenever the pipeline uses its streaming ability (so when passing lists or `Dataset`).
+
+```python
+from transformers import pipeline                                                   
+from transformers.pipelines.base import KeyDataset
+import datasets
+import tqdm                                                                         
+
+dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")
+pipe = pipeline("text-classification", device=0)
+for out in pipe(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"):
+    print(out)
+    # [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+    # Exactly the same output as before, but the content are passed
+    # as batches to the model
+```
+
+<Tip warning={true}>
+
+However, this is not automatically a win for performance. It can be either a 10x speedup or 5x slowdown depending
+on hardware, data and the actual model being used.
+
+Example where it's most a speedup:
+
+</Tip>
+
+```python
+from transformers import pipeline                                                   
+from torch.utils.data import Dataset                                                
+import tqdm                                                                         
+
+
+pipe = pipeline("text-classification", device=0)                                    
+
+
+class MyDataset(Dataset):                                                           
+    def __len__(self):                                                              
+        return 5000                                                                 
+
+    def __getitem__(self, i):                                                       
+        return "This is a test"                                                     
+
+
+dataset = MyDataset()   
+
+for batch_size in [1, 8, 64, 256]:
+    print("-" * 30)                                                                     
+    print(f"Streaming batch_size={batch_size}")    
+    for out in tqdm.tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):              
+        pass
+```
+
+```
+# On GTX 970
+------------------------------
+Streaming no batching
+100%|██████████████████████████████████████████████████████████████████████| 5000/5000 [00:26<00:00, 187.52it/s]
+------------------------------
+Streaming batch_size=8
+100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:04<00:00, 1205.95it/s]
+------------------------------
+Streaming batch_size=64
+100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 2478.24it/s]
+------------------------------
+Streaming batch_size=256
+100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 2554.43it/s]
+(diminishing returns, saturated the GPU)
+```
+
+Example where it's most a slowdown:
+
+```python
+class MyDataset(Dataset):                                                           
+    def __len__(self):                                                              
+        return 5000                                                                 
+
+    def __getitem__(self, i):                                                       
+        if i % 64 == 0:                                                          
+            n = 100                                                              
+        else:                                                                    
+            n = 1                                                                
+        return "This is a test" * n
+```
+
+This is a occasional very long sentence compared to the other. In that case, the **whole** batch will need to be 400
+tokens long, so the whole batch will be [64, 400] instead of [64, 4], leading to the high slowdown. Even worse, on
+bigger batches, the program simply crashes.
+
+
+```
+------------------------------
+Streaming no batching
+100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s]
+------------------------------
+Streaming batch_size=8
+100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 265.74it/s]
+------------------------------
+Streaming batch_size=64
+100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [00:26<00:00, 37.80it/s]
+------------------------------
+Streaming batch_size=256
+  0%|                                                                                 | 0/1000 [00:00<?, ?it/s]
+Traceback (most recent call last):
+  File "/home/nicolas/src/transformers/test.py", line 42, in <module>
+    for out in tqdm.tqdm(pipe(dataset, batch_size=256), total=len(dataset)):
+....
+    q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
+RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 3.95 GiB total capacity; 1.72 GiB already allocated; 354.88 MiB free; 2.46 GiB reserved in total by PyTorch)
+```
+
+There are no good (general) solutions for this problem, and your mileage may vary depending on your use cases. Rule of
+thumb:
+
+For users, a rule of thumb is:
+
+- **Measure performance on your load, with your hardware. Measure, measure, and keep measuring. Real numbers are the
+  only way to go.**
+- If you are latency constrained (live product doing inference), don't batch
+- If you are using CPU, don't batch.
+- If you are using throughput (you want to run your model on a bunch of static data), on GPU, then:
+
+  - If you have no clue about the size of the sequence_length ("natural" data), by default don't batch, measure and
+    try tentatively to add it, add OOM checks to recover when it will fail (and it will at some point if you don't
+    control the sequence_length.)
+  - If your sequence_length is super regular, then batching is more likely to be VERY interesting, measure and push
+    it until you get OOMs.
+  - The larger the GPU the more likely batching is going to be more interesting
+- As soon as you enable batching, make sure you can handle OOMs nicely.
+
+## Pipeline custom code
+
+If you want to override a specific pipeline.
+
+Don't hesitate to create an issue for your task at hand, the goal of the pipeline is to be easy to use and support most
+cases, so `transformers` could maybe support your use case.
+
+
+If you want to try simply you can:
+
+- Subclass your pipeline of choice
+
+```python
+class MyPipeline(TextClassificationPipeline):
+    def postprocess(...):
+        ...
+        scores = scores * 100
+        ...
+
+my_pipeline = MyPipeline(model=model, tokenizer=tokenizer, ...)
+# or if you use *pipeline* function, then:
+my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline)
+```
+
+That should enable you to do all the custom code you want.
+
+
+## Implementing a pipeline
+
+[Implementing a new pipeline](../add_new_pipeline)
+
+## The task specific pipelines
+
+
+### AudioClassificationPipeline
+
+[[autodoc]] AudioClassificationPipeline
+    - __call__
+    - all
+
+### AutomaticSpeechRecognitionPipeline
+
+[[autodoc]] AutomaticSpeechRecognitionPipeline
+    - __call__
+    - all
+
+### ConversationalPipeline
+
+[[autodoc]] Conversation
+
+[[autodoc]] ConversationalPipeline
+    - __call__
+    - all
+
+### FeatureExtractionPipeline
+
+[[autodoc]] FeatureExtractionPipeline
+    - __call__
+    - all
+
+### FillMaskPipeline
+
+[[autodoc]] FillMaskPipeline
+    - __call__
+    - all
+
+### ImageClassificationPipeline
+
+[[autodoc]] ImageClassificationPipeline
+    - __call__
+    - all
+
+### ImageSegmentationPipeline
+
+[[autodoc]] ImageSegmentationPipeline
+    - __call__
+    - all
+
+### NerPipeline
+
+[[autodoc]] NerPipeline
+
+See [`TokenClassificationPipeline`] for all details.
+
+### ObjectDetectionPipeline
+
+[[autodoc]] ObjectDetectionPipeline
+    - __call__
+    - all
+
+### QuestionAnsweringPipeline
+
+[[autodoc]] QuestionAnsweringPipeline
+    - __call__
+    - all
+
+### SummarizationPipeline
+
+[[autodoc]] SummarizationPipeline
+    - __call__
+    - all
+
+### TableQuestionAnsweringPipeline
+
+[[autodoc]] TableQuestionAnsweringPipeline
+    - __call__
+
+### TextClassificationPipeline
+
+[[autodoc]] TextClassificationPipeline
+    - __call__
+    - all
+
+### TextGenerationPipeline
+
+[[autodoc]] TextGenerationPipeline
+    - __call__
+    - all
+
+### Text2TextGenerationPipeline
+
+[[autodoc]] Text2TextGenerationPipeline
+    - __call__
+    - all
+
+### TokenClassificationPipeline
+
+[[autodoc]] TokenClassificationPipeline
+    - __call__
+    - all
+
+### TranslationPipeline
+
+[[autodoc]] TranslationPipeline
+    - __call__
+    - all
+
+### ZeroShotClassificationPipeline
+
+[[autodoc]] ZeroShotClassificationPipeline
+    - __call__
+    - all
+
+## Parent class: `Pipeline`
+
+[[autodoc]] Pipeline
diff --git a/docs/source/main_classes/pipelines.rst b/docs/source/main_classes/pipelines.rst
deleted file mode 100644
index bd78e9e63a..0000000000
--- a/docs/source/main_classes/pipelines.rst
+++ /dev/null
@@ -1,407 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Pipelines
------------------------------------------------------------------------------------------------------------------------
-
-The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most of
-the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity
-Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering. See the
-:doc:`task summary <../task_summary>` for examples of use.
-
-There are two categories of pipeline abstractions to be aware about:
-
-- The :func:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines.
-- The other task-specific pipelines:
-
-    - :class:`~transformers.AudioClassificationPipeline`
-    - :class:`~transformers.AutomaticSpeechRecognitionPipeline`
-    - :class:`~transformers.ConversationalPipeline`
-    - :class:`~transformers.FeatureExtractionPipeline`
-    - :class:`~transformers.FillMaskPipeline`
-    - :class:`~transformers.ImageClassificationPipeline`
-    - :class:`~transformers.ImageSegmentationPipeline`
-    - :class:`~transformers.ObjectDetectionPipeline`
-    - :class:`~transformers.QuestionAnsweringPipeline`
-    - :class:`~transformers.SummarizationPipeline`
-    - :class:`~transformers.TableQuestionAnsweringPipeline`
-    - :class:`~transformers.TextClassificationPipeline`
-    - :class:`~transformers.TextGenerationPipeline`
-    - :class:`~transformers.Text2TextGenerationPipeline`
-    - :class:`~transformers.TokenClassificationPipeline`
-    - :class:`~transformers.TranslationPipeline`
-    - :class:`~transformers.ZeroShotClassificationPipeline`
-
-The pipeline abstraction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any other
-pipeline but can provide additional quality of life.
-
-Simple call on one item:
-
-.. code-block::
-
-    >>> pipe = pipeline("text-classification")
-    >>> pipe("This restaurant is awesome")
-    [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
-
-If you want to use a specific model from the `hub <https://huggingface.co>`__ you can ignore the task if the model on
-the hub already defines it:
-
-.. code-block::
-
-    >>> pipe = pipeline(model="roberta-large-mnli")
-    >>> pipe("This restaurant is awesome")
-    [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
-
-To call a pipeline on many items, you can either call with a `list`.
-
-.. code-block::
-
-    >>> pipe = pipeline("text-classification")
-    >>> pipe(["This restaurant is awesome", "This restaurant is aweful"])
-    [{'label': 'POSITIVE', 'score': 0.9998743534088135},
-     {'label': 'NEGATIVE', 'score': 0.9996669292449951}]
-
-
-To iterate of full datasets it is recommended to use a :obj:`dataset` directly. This means you don't need to allocate
-the whole dataset at once, nor do you need to do batching yourself. This should work just as fast as custom loops on
-GPU. If it doesn't don't hesitate to create an issue.
-
-.. code-block::
-
-    import datasets
-    from transformers import pipeline
-    from transformers.pipelines.base import KeyDataset
-    import tqdm
-
-    pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
-    dataset = datasets.load_dataset("superb", name="asr", split="test")
-
-    # KeyDataset (only `pt`) will simply return the item in the dict returned by the dataset item
-    # as we're not interested in the `target` part of the dataset.
-    for out in tqdm.tqdm(pipe(KeyDataset(dataset, "file"))):
-        print(out)
-        # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
-        # {"text": ....}
-        # ....
-
-
-.. autofunction:: transformers.pipeline
-
-Pipeline batching
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-All pipelines (except `zero-shot-classification` and `question-answering` currently) can use batching. This will work
-whenever the pipeline uses its streaming ability (so when passing lists or :obj:`Dataset`).
-
-.. code-block::
-
-    from transformers import pipeline                                                   
-    from transformers.pipelines.base import KeyDataset
-    import datasets
-    import tqdm                                                                         
-
-    dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")
-    pipe = pipeline("text-classification", device=0)
-    for out in pipe(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"):
-        print(out)
-        # [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
-        # Exactly the same output as before, but the content are passed
-        # as batches to the model
-
-
-.. warning::
-
-    However, this is not automatically a win for performance. It can be either a 10x speedup or 5x slowdown depending
-    on hardware, data and the actual model being used.
-
-    Example where it's most a speedup:
-
-
-.. code-block::
-
-    from transformers import pipeline                                                   
-    from torch.utils.data import Dataset                                                
-    import tqdm                                                                         
-
-
-    pipe = pipeline("text-classification", device=0)                                    
-
-
-    class MyDataset(Dataset):                                                           
-        def __len__(self):                                                              
-            return 5000                                                                 
-
-        def __getitem__(self, i):                                                       
-            return "This is a test"                                                     
-
-
-    dataset = MyDataset()   
-
-    for batch_size in [1, 8, 64, 256]:
-        print("-" * 30)                                                                     
-        print(f"Streaming batch_size={batch_size}")    
-        for out in tqdm.tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):              
-            pass
-
-
-.. code-block::
-
-    # On GTX 970
-    ------------------------------
-    Streaming no batching
-    100%|██████████████████████████████████████████████████████████████████████| 5000/5000 [00:26<00:00, 187.52it/s]
-    ------------------------------
-    Streaming batch_size=8
-    100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:04<00:00, 1205.95it/s]
-    ------------------------------
-    Streaming batch_size=64
-    100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 2478.24it/s]
-    ------------------------------
-    Streaming batch_size=256
-    100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 2554.43it/s]
-    (diminishing returns, saturated the GPU)
-
-
-Example where it's most a slowdown:
-
-.. code-block::
-
-    class MyDataset(Dataset):                                                           
-        def __len__(self):                                                              
-            return 5000                                                                 
-
-        def __getitem__(self, i):                                                       
-            if i % 64 == 0:                                                          
-                n = 100                                                              
-            else:                                                                    
-                n = 1                                                                
-            return "This is a test" * n
-
-This is a occasional very long sentence compared to the other. In that case, the **whole** batch will need to be 400
-tokens long, so the whole batch will be [64, 400] instead of [64, 4], leading to the high slowdown. Even worse, on
-bigger batches, the program simply crashes.
-
-
-.. code-block::
-
-    ------------------------------
-    Streaming no batching
-    100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s]
-    ------------------------------
-    Streaming batch_size=8
-    100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 265.74it/s]
-    ------------------------------
-    Streaming batch_size=64
-    100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [00:26<00:00, 37.80it/s]
-    ------------------------------
-    Streaming batch_size=256
-      0%|                                                                                 | 0/1000 [00:00<?, ?it/s]
-    Traceback (most recent call last):
-      File "/home/nicolas/src/transformers/test.py", line 42, in <module>
-        for out in tqdm.tqdm(pipe(dataset, batch_size=256), total=len(dataset)):
-    ....
-        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
-    RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 3.95 GiB total capacity; 1.72 GiB already allocated; 354.88 MiB free; 2.46 GiB reserved in total by PyTorch)
-
-
-There are no good (general) solutions for this problem, and your mileage may vary depending on your use cases. Rule of
-thumb:
-
-For users, a rule of thumb is:
-
-- **Measure performance on your load, with your hardware. Measure, measure, and keep measuring. Real numbers are the
-  only way to go.**
-- If you are latency constrained (live product doing inference), don't batch
-- If you are using CPU, don't batch.
-- If you are using throughput (you want to run your model on a bunch of static data), on GPU, then:
-
-      - If you have no clue about the size of the sequence_length ("natural" data), by default don't batch, measure and
-        try tentatively to add it, add OOM checks to recover when it will fail (and it will at some point if you don't
-        control the sequence_length.)
-      - If your sequence_length is super regular, then batching is more likely to be VERY interesting, measure and push
-        it until you get OOMs.
-      - The larger the GPU the more likely batching is going to be more interesting
-- As soon as you enable batching, make sure you can handle OOMs nicely.
-
-Pipeline custom code
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If you want to override a specific pipeline.
-
-Don't hesitate to create an issue for your task at hand, the goal of the pipeline is to be easy to use and support most
-cases, so :obj:`transformers` could maybe support your use case.
-
-
-If you want to try simply you can:
-
-- Subclass your pipeline of choice
-
-.. code-block::
-
-    class MyPipeline(TextClassificationPipeline):
-        def postprocess(...):
-            ...
-            scores = scores * 100
-            ...
-
-    my_pipeline = MyPipeline(model=model, tokenizer=tokenizer, ...)
-    # or if you use `pipeline` function, then:
-    my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline)
-
-That should enable you to do all the custom code you want.
-
-
-Implementing a pipeline
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-:doc:`Implementing a new pipeline <../add_new_pipeline>`
-
-The task specific pipelines
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-AudioClassificationPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.AudioClassificationPipeline
-    :special-members: __call__
-    :members:
-
-AutomaticSpeechRecognitionPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.AutomaticSpeechRecognitionPipeline
-    :special-members: __call__
-    :members:
-
-ConversationalPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.Conversation
-
-.. autoclass:: transformers.ConversationalPipeline
-    :special-members: __call__
-    :members:
-
-FeatureExtractionPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.FeatureExtractionPipeline
-    :special-members: __call__
-    :members:
-
-FillMaskPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.FillMaskPipeline
-    :special-members: __call__
-    :members:
-
-ImageClassificationPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.ImageClassificationPipeline
-    :special-members: __call__
-    :members:
-
-ImageSegmentationPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.ImageSegmentationPipeline
-    :special-members: __call__
-    :members:
-
-NerPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.NerPipeline
-
-See :class:`~transformers.TokenClassificationPipeline` for all details.
-
-ObjectDetectionPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.ObjectDetectionPipeline
-    :special-members: __call__
-    :members:
-
-QuestionAnsweringPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.QuestionAnsweringPipeline
-    :special-members: __call__
-    :members:
-
-SummarizationPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.SummarizationPipeline
-    :special-members: __call__
-    :members:
-
-TableQuestionAnsweringPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.TableQuestionAnsweringPipeline
-    :special-members: __call__
-
-
-TextClassificationPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.TextClassificationPipeline
-    :special-members: __call__
-    :members:
-
-TextGenerationPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.TextGenerationPipeline
-    :special-members: __call__
-    :members:
-
-Text2TextGenerationPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.Text2TextGenerationPipeline
-    :special-members: __call__
-    :members:
-
-TokenClassificationPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.TokenClassificationPipeline
-    :special-members: __call__
-    :members:
-
-TranslationPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.TranslationPipeline
-    :special-members: __call__
-    :members:
-
-ZeroShotClassificationPipeline
-=======================================================================================================================
-
-.. autoclass:: transformers.ZeroShotClassificationPipeline
-    :special-members: __call__
-    :members:
-
-Parent class: :obj:`Pipeline`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Pipeline
-    :members:
diff --git a/docs/source/main_classes/processors.mdx b/docs/source/main_classes/processors.mdx
new file mode 100644
index 0000000000..9ed295378c
--- /dev/null
+++ b/docs/source/main_classes/processors.mdx
@@ -0,0 +1,152 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Processors
+
+This library includes processors for several traditional tasks. These processors can be used to process a dataset into
+examples that can be fed to a model.
+
+## Processors
+
+All processors follow the same architecture which is that of the
+[`~data.processors.utils.DataProcessor`]. The processor returns a list of
+[`~data.processors.utils.InputExample`]. These
+[`~data.processors.utils.InputExample`] can be converted to
+[`~data.processors.utils.InputFeatures`] in order to be fed to the model.
+
+[[autodoc]] data.processors.utils.DataProcessor
+
+[[autodoc]] data.processors.utils.InputExample
+
+[[autodoc]] data.processors.utils.InputFeatures
+
+## GLUE
+
+[General Language Understanding Evaluation (GLUE)](https://gluebenchmark.com/) is a benchmark that evaluates the
+performance of models across a diverse set of existing NLU tasks. It was released together with the paper [GLUE: A
+multi-task benchmark and analysis platform for natural language understanding](https://openreview.net/pdf?id=rJ4km2R5t7)
+
+This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched), CoLA, SST2, STSB,
+QQP, QNLI, RTE and WNLI.
+
+Those processors are:
+
+- [`~data.processors.utils.MrpcProcessor`]
+- [`~data.processors.utils.MnliProcessor`]
+- [`~data.processors.utils.MnliMismatchedProcessor`]
+- [`~data.processors.utils.Sst2Processor`]
+- [`~data.processors.utils.StsbProcessor`]
+- [`~data.processors.utils.QqpProcessor`]
+- [`~data.processors.utils.QnliProcessor`]
+- [`~data.processors.utils.RteProcessor`]
+- [`~data.processors.utils.WnliProcessor`]
+
+Additionally, the following method can be used to load values from a data file and convert them to a list of
+[`~data.processors.utils.InputExample`].
+
+automethod,transformers.data.processors.glue.glue_convert_examples_to_features
+
+
+### Example usage
+
+An example using these processors is given in the [run_glue.py](https://github.com/huggingface/transformers/tree/master/examples/legacy/text-classification/run_glue.py) script.
+
+
+## XNLI
+
+[The Cross-Lingual NLI Corpus (XNLI)](https://www.nyu.edu/projects/bowman/xnli/) is a benchmark that evaluates the
+quality of cross-lingual text representations. XNLI is crowd-sourced dataset based on [*MultiNLI*](http://www.nyu.edu/projects/bowman/multinli/): pairs of text are labeled with textual entailment annotations for 15
+different languages (including both high-resource language such as English and low-resource languages such as Swahili).
+
+It was released together with the paper [XNLI: Evaluating Cross-lingual Sentence Representations](https://arxiv.org/abs/1809.05053)
+
+This library hosts the processor to load the XNLI data:
+
+- [`~data.processors.utils.XnliProcessor`]
+
+Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
+
+An example using these processors is given in the [run_xnli.py](https://github.com/huggingface/transformers/tree/master/examples/legacy/text-classification/run_xnli.py) script.
+
+
+## SQuAD
+
+[The Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer//) is a benchmark that
+evaluates the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version
+(v1.1) was released together with the paper [SQuAD: 100,000+ Questions for Machine Comprehension of Text](https://arxiv.org/abs/1606.05250). The second version (v2.0) was released alongside the paper [Know What You Don't
+Know: Unanswerable Questions for SQuAD](https://arxiv.org/abs/1806.03822).
+
+This library hosts a processor for each of the two versions:
+
+### Processors
+
+Those processors are:
+
+- [`~data.processors.utils.SquadV1Processor`]
+- [`~data.processors.utils.SquadV2Processor`]
+
+They both inherit from the abstract class [`~data.processors.utils.SquadProcessor`]
+
+[[autodoc]] data.processors.squad.SquadProcessor
+    - all
+
+Additionally, the following method can be used to convert SQuAD examples into
+[`~data.processors.utils.SquadFeatures`] that can be used as model inputs.
+
+automethod,transformers.data.processors.squad.squad_convert_examples_to_features
+
+
+These processors as well as the aforementionned method can be used with files containing the data as well as with the
+*tensorflow_datasets* package. Examples are given below.
+
+
+### Example usage
+
+Here is an example using the processors as well as the conversion method using data files:
+
+```python
+# Loading a V2 processor
+processor = SquadV2Processor()
+examples = processor.get_dev_examples(squad_v2_data_dir)
+
+# Loading a V1 processor
+processor = SquadV1Processor()
+examples = processor.get_dev_examples(squad_v1_data_dir)
+
+features = squad_convert_examples_to_features( 
+    examples=examples,
+    tokenizer=tokenizer,
+    max_seq_length=max_seq_length,
+    doc_stride=args.doc_stride,
+    max_query_length=max_query_length,
+    is_training=not evaluate,
+)
+```
+
+Using *tensorflow_datasets* is as easy as using a data file:
+
+```python
+# tensorflow_datasets only handle Squad V1.
+tfds_examples = tfds.load("squad")
+examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
+
+features = squad_convert_examples_to_features( 
+    examples=examples,
+    tokenizer=tokenizer,
+    max_seq_length=max_seq_length,
+    doc_stride=args.doc_stride,
+    max_query_length=max_query_length,
+    is_training=not evaluate,
+)
+```
+
+Another example using these processors is given in the [run_squad.py](https://github.com/huggingface/transformers/tree/master/examples/legacy/question-answering/run_squad.py) script.
diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst
deleted file mode 100644
index b7e70bc655..0000000000
--- a/docs/source/main_classes/processors.rst
+++ /dev/null
@@ -1,172 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Processors
------------------------------------------------------------------------------------------------------------------------
-
-This library includes processors for several traditional tasks. These processors can be used to process a dataset into
-examples that can be fed to a model.
-
-Processors
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-All processors follow the same architecture which is that of the
-:class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list of
-:class:`~transformers.data.processors.utils.InputExample`. These
-:class:`~transformers.data.processors.utils.InputExample` can be converted to
-:class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.
-
-.. autoclass:: transformers.data.processors.utils.DataProcessor
-    :members:
-
-
-.. autoclass:: transformers.data.processors.utils.InputExample
-    :members:
-
-
-.. autoclass:: transformers.data.processors.utils.InputFeatures
-    :members:
-
-
-GLUE
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates the
-performance of models across a diverse set of existing NLU tasks. It was released together with the paper `GLUE: A
-multi-task benchmark and analysis platform for natural language understanding
-<https://openreview.net/pdf?id=rJ4km2R5t7>`__
-
-This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched), CoLA, SST2, STSB,
-QQP, QNLI, RTE and WNLI.
-
-Those processors are:
-
-    - :class:`~transformers.data.processors.utils.MrpcProcessor`
-    - :class:`~transformers.data.processors.utils.MnliProcessor`
-    - :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
-    - :class:`~transformers.data.processors.utils.Sst2Processor`
-    - :class:`~transformers.data.processors.utils.StsbProcessor`
-    - :class:`~transformers.data.processors.utils.QqpProcessor`
-    - :class:`~transformers.data.processors.utils.QnliProcessor`
-    - :class:`~transformers.data.processors.utils.RteProcessor`
-    - :class:`~transformers.data.processors.utils.WnliProcessor`
-
-Additionally, the following method can be used to load values from a data file and convert them to a list of
-:class:`~transformers.data.processors.utils.InputExample`.
-
-.. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
-
-Example usage
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-An example using these processors is given in the :prefix_link:`run_glue.py
-<examples/legacy/text-classification/run_glue.py>` script.
-
-
-XNLI
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`The Cross-Lingual NLI Corpus (XNLI) <https://www.nyu.edu/projects/bowman/xnli/>`__ is a benchmark that evaluates the
-quality of cross-lingual text representations. XNLI is crowd-sourced dataset based on `MultiNLI
-<http://www.nyu.edu/projects/bowman/multinli/>`: pairs of text are labeled with textual entailment annotations for 15
-different languages (including both high-resource language such as English and low-resource languages such as Swahili).
-
-It was released together with the paper `XNLI: Evaluating Cross-lingual Sentence Representations
-<https://arxiv.org/abs/1809.05053>`__
-
-This library hosts the processor to load the XNLI data:
-
-    - :class:`~transformers.data.processors.utils.XnliProcessor`
-
-Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
-
-An example using these processors is given in the :prefix_link:`run_xnli.py
-<examples/legacy/text-classification/run_xnli.py>` script.
-
-
-SQuAD
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer//>`__ is a benchmark that
-evaluates the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version
-(v1.1) was released together with the paper `SQuAD: 100,000+ Questions for Machine Comprehension of Text
-<https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside the paper `Know What You Don't
-Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.
-
-This library hosts a processor for each of the two versions:
-
-Processors
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Those processors are:
-
-    - :class:`~transformers.data.processors.utils.SquadV1Processor`
-    - :class:`~transformers.data.processors.utils.SquadV2Processor`
-
-They both inherit from the abstract class :class:`~transformers.data.processors.utils.SquadProcessor`
-
-.. autoclass:: transformers.data.processors.squad.SquadProcessor
-    :members:
-
-Additionally, the following method can be used to convert SQuAD examples into
-:class:`~transformers.data.processors.utils.SquadFeatures` that can be used as model inputs.
-
-.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features
-
-These processors as well as the aforementionned method can be used with files containing the data as well as with the
-`tensorflow_datasets` package. Examples are given below.
-
-
-Example usage
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Here is an example using the processors as well as the conversion method using data files:
-
-.. code-block::
-
-    # Loading a V2 processor
-    processor = SquadV2Processor()
-    examples = processor.get_dev_examples(squad_v2_data_dir)
-
-    # Loading a V1 processor
-    processor = SquadV1Processor()
-    examples = processor.get_dev_examples(squad_v1_data_dir)
-
-    features = squad_convert_examples_to_features( 
-        examples=examples,
-        tokenizer=tokenizer,
-        max_seq_length=max_seq_length,
-        doc_stride=args.doc_stride,
-        max_query_length=max_query_length,
-        is_training=not evaluate,
-    )
-
-Using `tensorflow_datasets` is as easy as using a data file:
-
-.. code-block::
-
-    # tensorflow_datasets only handle Squad V1.
-    tfds_examples = tfds.load("squad")
-    examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
-
-    features = squad_convert_examples_to_features( 
-        examples=examples,
-        tokenizer=tokenizer,
-        max_seq_length=max_seq_length,
-        doc_stride=args.doc_stride,
-        max_query_length=max_query_length,
-        is_training=not evaluate,
-    )
-
-
-Another example using these processors is given in the :prefix_link:`run_squad.py
-<examples/legacy/question-answering/run_squad.py>` script.
diff --git a/docs/source/main_classes/tokenizer.mdx b/docs/source/main_classes/tokenizer.mdx
new file mode 100644
index 0000000000..366ffce5f7
--- /dev/null
+++ b/docs/source/main_classes/tokenizer.mdx
@@ -0,0 +1,77 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Tokenizer
+
+A tokenizer is in charge of preparing the inputs for a model. The library contains tokenizers for all the models. Most
+of the tokenizers are available in two flavors: a full python implementation and a "Fast" implementation based on the
+Rust library [🤗 Tokenizers](https://github.com/huggingface/tokenizers). The "Fast" implementations allows:
+
+1. a significant speed-up in particular when doing batched tokenization and
+2. additional methods to map between the original string (character and words) and the token space (e.g. getting the
+   index of the token comprising a given character or the span of characters corresponding to a given token). Currently
+   no "Fast" implementation is available for the SentencePiece-based tokenizers (for T5, ALBERT, CamemBERT, XLM-RoBERTa
+   and XLNet models).
+
+The base classes [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`]
+implement the common methods for encoding string inputs in model inputs (see below) and instantiating/saving python and
+"Fast" tokenizers either from a local file or directory or from a pretrained tokenizer provided by the library
+(downloaded from HuggingFace's AWS S3 repository). They both rely on
+[`~tokenization_utils_base.PreTrainedTokenizerBase`] that contains the common methods, and
+[`~tokenization_utils_base.SpecialTokensMixin`].
+
+[`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] thus implement the main
+methods for using all the tokenizers:
+
+- Tokenizing (splitting strings in sub-word token strings), converting tokens strings to ids and back, and
+  encoding/decoding (i.e., tokenizing and converting to integers).
+- Adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece...).
+- Managing special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the
+  tokenizer for easy access and making sure they are not split during tokenization.
+
+[`BatchEncoding`] holds the output of the
+[`~tokenization_utils_base.PreTrainedTokenizerBase`]'s encoding methods (`__call__`,
+`encode_plus` and `batch_encode_plus`) and is derived from a Python dictionary. When the tokenizer is a pure python
+tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by
+these methods (`input_ids`, `attention_mask`...). When the tokenizer is a "Fast" tokenizer (i.e., backed by
+HuggingFace [tokenizers library](https://github.com/huggingface/tokenizers)), this class provides in addition
+several advanced alignment methods which can be used to map between the original string (character and words) and the
+token space (e.g., getting the index of the token comprising a given character or the span of characters corresponding
+to a given token).
+
+
+## PreTrainedTokenizer
+
+[[autodoc]] PreTrainedTokenizer
+    - __call__
+    - batch_decode
+    - decode
+    - encode
+    - push_to_hub
+    - all
+
+## PreTrainedTokenizerFast
+
+The [`PreTrainedTokenizerFast`] depend on the [tokenizers](https://huggingface.co/docs/tokenizers) library. The tokenizers obtained from the 🤗 tokenizers library can be
+loaded very simply into 🤗 transformers. Take a look at the [Using tokenizers from 🤗 tokenizers](../fast_tokenizers) page to understand how this is done.
+
+[[autodoc]] PreTrainedTokenizerFast
+    - __call__
+    - batch_decode
+    - decode
+    - encode
+    - push_to_hub
+    - all
+
+## BatchEncoding
+
+[[autodoc]] BatchEncoding
diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst
deleted file mode 100644
index 7b591a5e9a..0000000000
--- a/docs/source/main_classes/tokenizer.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Tokenizer
------------------------------------------------------------------------------------------------------------------------
-
-A tokenizer is in charge of preparing the inputs for a model. The library contains tokenizers for all the models. Most
-of the tokenizers are available in two flavors: a full python implementation and a "Fast" implementation based on the
-Rust library `tokenizers <https://github.com/huggingface/tokenizers>`__. The "Fast" implementations allows:
-
-1. a significant speed-up in particular when doing batched tokenization and
-2. additional methods to map between the original string (character and words) and the token space (e.g. getting the
-   index of the token comprising a given character or the span of characters corresponding to a given token). Currently
-   no "Fast" implementation is available for the SentencePiece-based tokenizers (for T5, ALBERT, CamemBERT, XLM-RoBERTa
-   and XLNet models).
-
-The base classes :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
-implement the common methods for encoding string inputs in model inputs (see below) and instantiating/saving python and
-"Fast" tokenizers either from a local file or directory or from a pretrained tokenizer provided by the library
-(downloaded from HuggingFace's AWS S3 repository). They both rely on
-:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that contains the common methods, and
-:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.
-
-:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` thus implement the main
-methods for using all the tokenizers:
-
-- Tokenizing (splitting strings in sub-word token strings), converting tokens strings to ids and back, and
-  encoding/decoding (i.e., tokenizing and converting to integers).
-- Adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece...).
-- Managing special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the
-  tokenizer for easy access and making sure they are not split during tokenization.
-
-:class:`~transformers.BatchEncoding` holds the output of the
-:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`'s encoding methods (``__call__``,
-``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python
-tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by
-these methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e., backed by
-HuggingFace `tokenizers library <https://github.com/huggingface/tokenizers>`__), this class provides in addition
-several advanced alignment methods which can be used to map between the original string (character and words) and the
-token space (e.g., getting the index of the token comprising a given character or the span of characters corresponding
-to a given token).
-
-
-PreTrainedTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.PreTrainedTokenizer
-    :special-members: __call__, batch_decode, decode, encode, push_to_hub
-    :members: 
-
-
-PreTrainedTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The :class:`~transformers.PreTrainedTokenizerFast` depend on the `tokenizers
-<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 tokenizers library can be
-loaded very simply into 🤗 transformers. Take a look at the :doc:`Using tokenizers from 🤗 tokenizers
-<../fast_tokenizers>` page to understand how this is done.
-
-.. autoclass:: transformers.PreTrainedTokenizerFast
-    :special-members: __call__, batch_decode, decode, encode, push_to_hub
-    :members:
-
-
-BatchEncoding
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BatchEncoding
-    :members:
diff --git a/docs/source/migration.md b/docs/source/migration.mdx
similarity index 100%
rename from docs/source/migration.md
rename to docs/source/migration.mdx
diff --git a/docs/source/model_doc/mbart.mdx b/docs/source/model_doc/mbart.mdx
index e08b155cc9..d61fb33d09 100644
--- a/docs/source/model_doc/mbart.mdx
+++ b/docs/source/model_doc/mbart.mdx
@@ -73,8 +73,7 @@ inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode tar
 
 ## Overview of MBart-50
 
-MBart-50 was introduced in the *Multilingual Translation with Extensible Multilingual Pretraining and Finetuning
-<https://arxiv.org/abs/2008.00401>* paper by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav
+MBart-50 was introduced in the [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) paper by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav
 Chaudhary, Jiatao Gu, Angela Fan. MBart-50 is created using the original *mbart-large-cc25* checkpoint by extendeding
 its embedding layers with randomly initialized vectors for an extra set of 25 language tokens and then pretrained on 50
 languages.
diff --git a/docs/source/model_doc/megatron_bert.mdx b/docs/source/model_doc/megatron_bert.mdx
new file mode 100644
index 0000000000..911bf76aec
--- /dev/null
+++ b/docs/source/model_doc/megatron_bert.mdx
@@ -0,0 +1,128 @@
+<!--Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# MegatronBERT
+
+## Overview
+
+The MegatronBERT model was proposed in [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model
+Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley,
+Jared Casper and Bryan Catanzaro.
+
+The abstract from the paper is the following:
+
+*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in
+Natural Language Processing applications. However, very large models can be quite difficult to train due to memory
+constraints. In this work, we present our techniques for training very large transformer models and implement a simple,
+efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our
+approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model
+parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We
+illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain
+15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline
+that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance
+the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9
+billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in
+BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we
+achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA
+accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy
+of 89.4%).*
+
+Tips:
+
+We have provided pretrained [BERT-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m) checkpoints
+for use to evaluate or finetuning downstream tasks.
+
+To access these checkpoints, first [sign up](https://ngc.nvidia.com/signup) for and setup the NVIDIA GPU Cloud (NGC)
+Registry CLI. Further documentation for downloading models can be found in the [NGC documentation](https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1).
+
+Alternatively, you can directly download the checkpoints using:
+
+BERT-345M-uncased:
+
+```bash
+wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip
+-O megatron_bert_345m_v0_1_uncased.zip
+```
+
+BERT-345M-cased:
+
+```bash
+wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O
+megatron_bert_345m_v0_1_cased.zip
+```
+
+Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to convert them to a format that will
+easily be loaded by Hugging Face Transformers and our port of the BERT code.
+
+The following commands allow you to do the conversion. We assume that the folder `models/megatron_bert` contains
+`megatron_bert_345m_v0_1_{cased, uncased}.zip` and that the commands are run from inside that folder:
+
+```bash
+python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip
+```
+
+```bash
+python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip
+```
+
+This model was contributed by [jdemouth](https://huggingface.co/jdemouth). The original code can be found [here](https://github.com/NVIDIA/Megatron-LM). That repository contains a multi-GPU and multi-node implementation of the
+Megatron Language models. In particular, it contains a hybrid model parallel approach using "tensor parallel" and
+"pipeline parallel" techniques.
+
+## MegatronBertConfig
+
+[[autodoc]] MegatronBertConfig
+
+## MegatronBertModel
+
+[[autodoc]] MegatronBertModel
+    - forward
+
+## MegatronBertForMaskedLM
+
+[[autodoc]] MegatronBertForMaskedLM
+    - forward
+
+## MegatronBertForCausalLM
+
+[[autodoc]] MegatronBertForCausalLM
+    - forward
+
+## MegatronBertForNextSentencePrediction
+
+[[autodoc]] MegatronBertForNextSentencePrediction
+    - forward
+
+## MegatronBertForPreTraining
+
+[[autodoc]] MegatronBertForPreTraining
+    - forward
+
+## MegatronBertForSequenceClassification
+
+[[autodoc]] MegatronBertForSequenceClassification
+    - forward
+
+## MegatronBertForMultipleChoice
+
+[[autodoc]] MegatronBertForMultipleChoice
+    - forward
+
+## MegatronBertForTokenClassification
+
+[[autodoc]] MegatronBertForTokenClassification
+    - forward
+
+## MegatronBertForQuestionAnswering
+
+[[autodoc]] MegatronBertForQuestionAnswering
+    - forward
diff --git a/docs/source/model_doc/megatron_bert.rst b/docs/source/model_doc/megatron_bert.rst
deleted file mode 100644
index 89e690734d..0000000000
--- a/docs/source/model_doc/megatron_bert.rst
+++ /dev/null
@@ -1,154 +0,0 @@
-.. 
-    Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-MegatronBERT
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The MegatronBERT model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model
-Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley,
-Jared Casper and Bryan Catanzaro.
-
-The abstract from the paper is the following:
-
-*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in
-Natural Language Processing applications. However, very large models can be quite difficult to train due to memory
-constraints. In this work, we present our techniques for training very large transformer models and implement a simple,
-efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our
-approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model
-parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We
-illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain
-15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline
-that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance
-the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9
-billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in
-BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we
-achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA
-accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy
-of 89.4%).*
-
-Tips:
-
-We have provided pretrained `BERT-345M <https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m>`__ checkpoints
-for use to evaluate or finetuning downstream tasks.
-
-To access these checkpoints, first `sign up <https://ngc.nvidia.com/signup>`__ for and setup the NVIDIA GPU Cloud (NGC)
-Registry CLI. Further documentation for downloading models can be found in the `NGC documentation
-<https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1>`__.
-
-Alternatively, you can directly download the checkpoints using:
-
-BERT-345M-uncased::
-
-.. code-block:: bash
-
-    wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip
-    -O megatron_bert_345m_v0_1_uncased.zip
-
-BERT-345M-cased::
-
-.. code-block:: bash
-
-    wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O
-    megatron_bert_345m_v0_1_cased.zip
-
-Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to convert them to a format that will
-easily be loaded by Hugging Face Transformers and our port of the BERT code.
-
-The following commands allow you to do the conversion. We assume that the folder ``models/megatron_bert`` contains
-``megatron_bert_345m_v0_1_{cased, uncased}.zip`` and that the commands are run from inside that folder::
-
-.. code-block:: bash
-
-    python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip 
-
-.. code-block:: bash
-
-    python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip
-
-This model was contributed by `jdemouth <https://huggingface.co/jdemouth>`__. The original code can be found `here
-<https://github.com/NVIDIA/Megatron-LM>`__. That repository contains a multi-GPU and multi-node implementation of the
-Megatron Language models. In particular, it contains a hybrid model parallel approach using "tensor parallel" and
-"pipeline parallel" techniques.
-
-MegatronBertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertConfig
-    :members:
-
-
-MegatronBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertModel
-    :members: forward
-
-
-MegatronBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForMaskedLM
-    :members: forward
-
-
-MegatronBertForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForCausalLM
-    :members: forward
-
-
-MegatronBertForNextSentencePrediction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForNextSentencePrediction
-    :members: forward
-
-
-MegatronBertForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForPreTraining
-    :members: forward
-
-
-MegatronBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForSequenceClassification
-    :members: forward
-
-
-MegatronBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForMultipleChoice
-    :members: forward
-
-
-MegatronBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForTokenClassification
-    :members: forward
-
-
-MegatronBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForQuestionAnswering
-    :members: forward
-
-
diff --git a/docs/source/model_doc/megatron_gpt2.rst b/docs/source/model_doc/megatron_gpt2.mdx
similarity index 52%
rename from docs/source/model_doc/megatron_gpt2.rst
rename to docs/source/model_doc/megatron_gpt2.mdx
index 4ec7e1b30a..a0d91e5a16 100644
--- a/docs/source/model_doc/megatron_gpt2.rst
+++ b/docs/source/model_doc/megatron_gpt2.mdx
@@ -1,23 +1,21 @@
-.. 
-    Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
+<!--Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
 
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
 
-        http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
 
-MegatronGPT2
------------------------------------------------------------------------------------------------------------------------
+# MegatronGPT2
 
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+## Overview
 
-The MegatronGPT2 model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model
-Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley,
+The MegatronGPT2 model was proposed in [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model
+Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley,
 Jared Casper and Bryan Catanzaro.
 
 The abstract from the paper is the following:
@@ -40,32 +38,30 @@ of 89.4%).*
 
 Tips:
 
-We have provided pretrained `GPT2-345M <https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m>`__ checkpoints
+We have provided pretrained [GPT2-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m) checkpoints
 for use to evaluate or finetuning downstream tasks.
 
-To access these checkpoints, first `sign up <https://ngc.nvidia.com/signup>`__ for and setup the NVIDIA GPU Cloud (NGC)
-Registry CLI. Further documentation for downloading models can be found in the `NGC documentation
-<https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1>`__.
+To access these checkpoints, first [sign up](https://ngc.nvidia.com/signup) for and setup the NVIDIA GPU Cloud (NGC)
+Registry CLI. Further documentation for downloading models can be found in the [NGC documentation](https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1).
 
-Alternatively, you can directly download the checkpoints using::
+Alternatively, you can directly download the checkpoints using:
 
-.. code-block:: bash
-
-    wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O
-    megatron_gpt2_345m_v0_0.zip
+```bash
+wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O
+megatron_gpt2_345m_v0_0.zip
+```
 
 Once you have obtained the checkpoint from NVIDIA GPU Cloud (NGC), you have to convert it to a format that will easily
 be loaded by Hugging Face Transformers GPT2 implementation.
 
-The following command allows you to do the conversion. We assume that the folder ``models/megatron_gpt2`` contains
-``megatron_gpt2_345m_v0_0.zip`` and that the command is run from that folder::
+The following command allows you to do the conversion. We assume that the folder `models/megatron_gpt2` contains
+`megatron_gpt2_345m_v0_0.zip` and that the command is run from that folder:
 
-.. code-block:: bash
+```bash
+python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip
+```
 
-    python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip
-
-This model was contributed by `jdemouth <https://huggingface.co/jdemouth>`__. The original code can be found `here
-<https://github.com/NVIDIA/Megatron-LM>`__. That repository contains a multi-GPU and multi-node implementation of the
+This model was contributed by [jdemouth](https://huggingface.co/jdemouth). The original code can be found [here](https://github.com/NVIDIA/Megatron-LM). That repository contains a multi-GPU and multi-node implementation of the
 Megatron Language models. In particular, it contains a hybrid model parallel approach using "tensor parallel" and
 "pipeline parallel" techniques.
 
diff --git a/docs/source/model_doc/mluke.mdx b/docs/source/model_doc/mluke.mdx
new file mode 100644
index 0000000000..ac1f3fb19f
--- /dev/null
+++ b/docs/source/model_doc/mluke.mdx
@@ -0,0 +1,61 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# mLUKE
+
+## Overview
+
+The mLUKE model was proposed in [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. It's a multilingual extension
+of the [LUKE model](https://arxiv.org/abs/2010.01057) trained on the basis of XLM-RoBERTa.
+
+It is based on XLM-RoBERTa and adds entity embeddings, which helps improve performance on various downstream tasks
+involving reasoning about entities such as named entity recognition, extractive question answering, relation
+classification, cloze-style knowledge completion.
+
+The abstract from the paper is the following:
+
+*Recent studies have shown that multilingual pretrained language models can be effectively improved with cross-lingual
+alignment information from Wikipedia entities. However, existing methods only exploit entity information in pretraining
+and do not explicitly use entities in downstream tasks. In this study, we explore the effectiveness of leveraging
+entity representations for downstream cross-lingual tasks. We train a multilingual language model with 24 languages
+with entity representations and show the model consistently outperforms word-based pretrained models in various
+cross-lingual transfer tasks. We also analyze the model and the key insight is that incorporating entity
+representations into the input allows us to extract more language-agnostic features. We also evaluate the model with a
+multilingual cloze prompt task with the mLAMA dataset. We show that entity-based prompt elicits correct factual
+knowledge more likely than using only word representations.*
+
+One can directly plug in the weights of mLUKE into a LUKE model, like so:
+
+```python
+from transformers import LukeModel
+
+model = LukeModel.from_pretrained('studio-ousia/mluke-base')
+```
+
+Note that mLUKE has its own tokenizer, [`MLukeTokenizer`]. You can initialize it as follows:
+
+```python
+from transformers import MLukeTokenizer
+
+tokenizer = MLukeTokenizer.from_pretrained('studio-ousia/mluke-base')
+```
+
+As mLUKE's architecture is equivalent to that of LUKE, one can refer to [LUKE's documentation page](luke) for all
+tips, code examples and notebooks.
+
+This model was contributed by [ryo0634](https://huggingface.co/ryo0634). The original code can be found [here](https://github.com/studio-ousia/luke).
+
+## MLukeTokenizer
+
+[[autodoc]] MLukeTokenizer
+    - __call__
+    - save_vocabulary
diff --git a/docs/source/model_doc/mluke.rst b/docs/source/model_doc/mluke.rst
deleted file mode 100644
index ddca5fcec6..0000000000
--- a/docs/source/model_doc/mluke.rst
+++ /dev/null
@@ -1,66 +0,0 @@
-..
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-mLUKE
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The mLUKE model was proposed in `mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models
-<https://arxiv.org/abs/2110.08151>`__ by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. It's a multilingual extension
-of the `LUKE model <https://arxiv.org/abs/2010.01057>`__ trained on the basis of XLM-RoBERTa.
-
-It is based on XLM-RoBERTa and adds entity embeddings, which helps improve performance on various downstream tasks
-involving reasoning about entities such as named entity recognition, extractive question answering, relation
-classification, cloze-style knowledge completion.
-
-The abstract from the paper is the following:
-
-*Recent studies have shown that multilingual pretrained language models can be effectively improved with cross-lingual
-alignment information from Wikipedia entities. However, existing methods only exploit entity information in pretraining
-and do not explicitly use entities in downstream tasks. In this study, we explore the effectiveness of leveraging
-entity representations for downstream cross-lingual tasks. We train a multilingual language model with 24 languages
-with entity representations and show the model consistently outperforms word-based pretrained models in various
-cross-lingual transfer tasks. We also analyze the model and the key insight is that incorporating entity
-representations into the input allows us to extract more language-agnostic features. We also evaluate the model with a
-multilingual cloze prompt task with the mLAMA dataset. We show that entity-based prompt elicits correct factual
-knowledge more likely than using only word representations.*
-
-One can directly plug in the weights of mLUKE into a LUKE model, like so:
-
-.. code-block::
-
-    from transformers import LukeModel
-
-    model = LukeModel.from_pretrained('studio-ousia/mluke-base')
-
-Note that mLUKE has its own tokenizer, :class:`~transformers.MLukeTokenizer`. You can initialize it as follows:
-
-.. code-block::
-
-    from transformers import MLukeTokenizer
-
-    tokenizer = MLukeTokenizer.from_pretrained('studio-ousia/mluke-base')
-
-
-As mLUKE's architecture is equivalent to that of LUKE, one can refer to :doc:`LUKE's documentation page <luke>` for all
-tips, code examples and notebooks.
-
-This model was contributed by `ryo0634 <https://huggingface.co/ryo0634>`__. The original code can be found `here
-<https://github.com/studio-ousia/luke>`__.
-
-MLukeTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MLukeTokenizer
-    :members: __call__, save_vocabulary
diff --git a/docs/source/model_doc/mobilebert.mdx b/docs/source/model_doc/mobilebert.mdx
new file mode 100644
index 0000000000..8305903d23
--- /dev/null
+++ b/docs/source/model_doc/mobilebert.mdx
@@ -0,0 +1,142 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# MobileBERT
+
+## Overview
+
+The MobileBERT model was proposed in [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny
+Zhou. It's a bidirectional transformer based on the BERT model, which is compressed and accelerated using several
+approaches.
+
+The abstract from the paper is the following:
+
+*Natural Language Processing (NLP) has recently achieved great success by using huge pre-trained models with hundreds
+of millions of parameters. However, these models suffer from heavy model sizes and high latency such that they cannot
+be deployed to resource-limited mobile devices. In this paper, we propose MobileBERT for compressing and accelerating
+the popular BERT model. Like the original BERT, MobileBERT is task-agnostic, that is, it can be generically applied to
+various downstream NLP tasks via simple fine-tuning. Basically, MobileBERT is a thin version of BERT_LARGE, while
+equipped with bottleneck structures and a carefully designed balance between self-attentions and feed-forward networks.
+To train MobileBERT, we first train a specially designed teacher model, an inverted-bottleneck incorporated BERT_LARGE
+model. Then, we conduct knowledge transfer from this teacher to MobileBERT. Empirical studies show that MobileBERT is
+4.3x smaller and 5.5x faster than BERT_BASE while achieving competitive results on well-known benchmarks. On the
+natural language inference tasks of GLUE, MobileBERT achieves a GLUEscore o 77.7 (0.6 lower than BERT_BASE), and 62 ms
+latency on a Pixel 4 phone. On the SQuAD v1.1/v2.0 question answering task, MobileBERT achieves a dev F1 score of
+90.0/79.2 (1.5/2.1 higher than BERT_BASE).*
+
+Tips:
+
+- MobileBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather
+  than the left.
+- MobileBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore
+  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained
+  with a causal language modeling (CLM) objective are better in that regard.
+
+This model was contributed by [vshampor](https://huggingface.co/vshampor). The original code can be found [here](https://github.com/google-research/mobilebert).
+
+## MobileBertConfig
+
+[[autodoc]] MobileBertConfig
+
+## MobileBertTokenizer
+
+[[autodoc]] MobileBertTokenizer
+
+## MobileBertTokenizerFast
+
+[[autodoc]] MobileBertTokenizerFast
+
+## MobileBert specific outputs
+
+[[autodoc]] models.mobilebert.modeling_mobilebert.MobileBertForPreTrainingOutput
+
+[[autodoc]] models.mobilebert.modeling_tf_mobilebert.TFMobileBertForPreTrainingOutput
+
+## MobileBertModel
+
+[[autodoc]] MobileBertModel
+    - forward
+
+## MobileBertForPreTraining
+
+[[autodoc]] MobileBertForPreTraining
+    - forward
+
+## MobileBertForMaskedLM
+
+[[autodoc]] MobileBertForMaskedLM
+    - forward
+
+## MobileBertForNextSentencePrediction
+
+[[autodoc]] MobileBertForNextSentencePrediction
+    - forward
+
+## MobileBertForSequenceClassification
+
+[[autodoc]] MobileBertForSequenceClassification
+    - forward
+
+## MobileBertForMultipleChoice
+
+[[autodoc]] MobileBertForMultipleChoice
+    - forward
+
+## MobileBertForTokenClassification
+
+[[autodoc]] MobileBertForTokenClassification
+    - forward
+
+## MobileBertForQuestionAnswering
+
+[[autodoc]] MobileBertForQuestionAnswering
+    - forward
+
+## TFMobileBertModel
+
+[[autodoc]] TFMobileBertModel
+    - call
+
+## TFMobileBertForPreTraining
+
+[[autodoc]] TFMobileBertForPreTraining
+    - call
+
+## TFMobileBertForMaskedLM
+
+[[autodoc]] TFMobileBertForMaskedLM
+    - call
+
+## TFMobileBertForNextSentencePrediction
+
+[[autodoc]] TFMobileBertForNextSentencePrediction
+    - call
+
+## TFMobileBertForSequenceClassification
+
+[[autodoc]] TFMobileBertForSequenceClassification
+    - call
+
+## TFMobileBertForMultipleChoice
+
+[[autodoc]] TFMobileBertForMultipleChoice
+    - call
+
+## TFMobileBertForTokenClassification
+
+[[autodoc]] TFMobileBertForTokenClassification
+    - call
+
+## TFMobileBertForQuestionAnswering
+
+[[autodoc]] TFMobileBertForQuestionAnswering
+    - call
diff --git a/docs/source/model_doc/mobilebert.rst b/docs/source/model_doc/mobilebert.rst
deleted file mode 100644
index 9166e382c9..0000000000
--- a/docs/source/model_doc/mobilebert.rst
+++ /dev/null
@@ -1,190 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-MobileBERT
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The MobileBERT model was proposed in `MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices
-<https://arxiv.org/abs/2004.02984>`__ by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny
-Zhou. It's a bidirectional transformer based on the BERT model, which is compressed and accelerated using several
-approaches.
-
-The abstract from the paper is the following:
-
-*Natural Language Processing (NLP) has recently achieved great success by using huge pre-trained models with hundreds
-of millions of parameters. However, these models suffer from heavy model sizes and high latency such that they cannot
-be deployed to resource-limited mobile devices. In this paper, we propose MobileBERT for compressing and accelerating
-the popular BERT model. Like the original BERT, MobileBERT is task-agnostic, that is, it can be generically applied to
-various downstream NLP tasks via simple fine-tuning. Basically, MobileBERT is a thin version of BERT_LARGE, while
-equipped with bottleneck structures and a carefully designed balance between self-attentions and feed-forward networks.
-To train MobileBERT, we first train a specially designed teacher model, an inverted-bottleneck incorporated BERT_LARGE
-model. Then, we conduct knowledge transfer from this teacher to MobileBERT. Empirical studies show that MobileBERT is
-4.3x smaller and 5.5x faster than BERT_BASE while achieving competitive results on well-known benchmarks. On the
-natural language inference tasks of GLUE, MobileBERT achieves a GLUEscore o 77.7 (0.6 lower than BERT_BASE), and 62 ms
-latency on a Pixel 4 phone. On the SQuAD v1.1/v2.0 question answering task, MobileBERT achieves a dev F1 score of
-90.0/79.2 (1.5/2.1 higher than BERT_BASE).*
-
-Tips:
-
-- MobileBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather
-  than the left.
-- MobileBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore
-  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained
-  with a causal language modeling (CLM) objective are better in that regard.
-
-This model was contributed by `vshampor <https://huggingface.co/vshampor>`__. The original code can be found `here
-<https://github.com/google-research/mobilebert>`__.
-
-MobileBertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MobileBertConfig
-    :members:
-
-
-MobileBertTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MobileBertTokenizer
-    :members:
-
-
-MobileBertTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MobileBertTokenizerFast
-    :members:
-
-
-MobileBert specific outputs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.models.mobilebert.modeling_mobilebert.MobileBertForPreTrainingOutput
-    :members:
-
-.. autoclass:: transformers.models.mobilebert.modeling_tf_mobilebert.TFMobileBertForPreTrainingOutput
-    :members:
-
-
-MobileBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MobileBertModel
-    :members: forward
-
-
-MobileBertForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MobileBertForPreTraining
-    :members: forward
-
-
-MobileBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MobileBertForMaskedLM
-    :members: forward
-
-
-MobileBertForNextSentencePrediction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MobileBertForNextSentencePrediction
-    :members: forward
-
-
-MobileBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MobileBertForSequenceClassification
-    :members: forward
-
-
-MobileBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MobileBertForMultipleChoice
-    :members: forward
-
-
-MobileBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MobileBertForTokenClassification
-    :members: forward
-
-
-MobileBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MobileBertForQuestionAnswering
-    :members: forward
-
-
-TFMobileBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMobileBertModel
-    :members: call
-
-
-TFMobileBertForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMobileBertForPreTraining
-    :members: call
-
-
-TFMobileBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMobileBertForMaskedLM
-    :members: call
-
-
-TFMobileBertForNextSentencePrediction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMobileBertForNextSentencePrediction
-    :members: call
-
-
-TFMobileBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMobileBertForSequenceClassification
-    :members: call
-
-
-TFMobileBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMobileBertForMultipleChoice
-    :members: call
-
-
-TFMobileBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMobileBertForTokenClassification
-    :members: call
-
-
-TFMobileBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMobileBertForQuestionAnswering
-    :members: call
diff --git a/docs/source/model_doc/mpnet.mdx b/docs/source/model_doc/mpnet.mdx
new file mode 100644
index 0000000000..0fa88ee87b
--- /dev/null
+++ b/docs/source/model_doc/mpnet.mdx
@@ -0,0 +1,117 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# MPNet
+
+## Overview
+
+The MPNet model was proposed in [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
+
+MPNet adopts a novel pre-training method, named masked and permuted language modeling, to inherit the advantages of
+masked language modeling and permuted language modeling for natural language understanding.
+
+The abstract from the paper is the following:
+
+*BERT adopts masked language modeling (MLM) for pre-training and is one of the most successful pre-training models.
+Since BERT neglects dependency among predicted tokens, XLNet introduces permuted language modeling (PLM) for
+pre-training to address this problem. However, XLNet does not leverage the full position information of a sentence and
+thus suffers from position discrepancy between pre-training and fine-tuning. In this paper, we propose MPNet, a novel
+pre-training method that inherits the advantages of BERT and XLNet and avoids their limitations. MPNet leverages the
+dependency among predicted tokens through permuted language modeling (vs. MLM in BERT), and takes auxiliary position
+information as input to make the model see a full sentence and thus reducing the position discrepancy (vs. PLM in
+XLNet). We pre-train MPNet on a large-scale dataset (over 160GB text corpora) and fine-tune on a variety of
+down-streaming tasks (GLUE, SQuAD, etc). Experimental results show that MPNet outperforms MLM and PLM by a large
+margin, and achieves better results on these tasks compared with previous state-of-the-art pre-trained methods (e.g.,
+BERT, XLNet, RoBERTa) under the same model setting.*
+
+Tips:
+
+- MPNet doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. just
+  separate your segments with the separation token `tokenizer.sep_token` (or `[sep]`).
+
+The original code can be found [here](https://github.com/microsoft/MPNet).
+
+## MPNetConfig
+
+[[autodoc]] MPNetConfig
+
+## MPNetTokenizer
+
+[[autodoc]] MPNetTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## MPNetTokenizerFast
+
+[[autodoc]] MPNetTokenizerFast
+
+## MPNetModel
+
+[[autodoc]] MPNetModel
+    - forward
+
+## MPNetForMaskedLM
+
+[[autodoc]] MPNetForMaskedLM
+    - forward
+
+## MPNetForSequenceClassification
+
+[[autodoc]] MPNetForSequenceClassification
+    - forward
+
+## MPNetForMultipleChoice
+
+[[autodoc]] MPNetForMultipleChoice
+    - forward
+
+## MPNetForTokenClassification
+
+[[autodoc]] MPNetForTokenClassification
+    - forward
+
+## MPNetForQuestionAnswering
+
+[[autodoc]] MPNetForQuestionAnswering
+    - forward
+
+## TFMPNetModel
+
+[[autodoc]] TFMPNetModel
+    - call
+
+## TFMPNetForMaskedLM
+
+[[autodoc]] TFMPNetForMaskedLM
+    - call
+
+## TFMPNetForSequenceClassification
+
+[[autodoc]] TFMPNetForSequenceClassification
+    - call
+
+## TFMPNetForMultipleChoice
+
+[[autodoc]] TFMPNetForMultipleChoice
+    - call
+
+## TFMPNetForTokenClassification
+
+[[autodoc]] TFMPNetForTokenClassification
+    - call
+
+## TFMPNetForQuestionAnswering
+
+[[autodoc]] TFMPNetForQuestionAnswering
+    - call
diff --git a/docs/source/model_doc/mpnet.rst b/docs/source/model_doc/mpnet.rst
deleted file mode 100644
index e41bd07869..0000000000
--- a/docs/source/model_doc/mpnet.rst
+++ /dev/null
@@ -1,149 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-MPNet
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The MPNet model was proposed in `MPNet: Masked and Permuted Pre-training for Language Understanding
-<https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
-
-MPNet adopts a novel pre-training method, named masked and permuted language modeling, to inherit the advantages of
-masked language modeling and permuted language modeling for natural language understanding.
-
-The abstract from the paper is the following:
-
-*BERT adopts masked language modeling (MLM) for pre-training and is one of the most successful pre-training models.
-Since BERT neglects dependency among predicted tokens, XLNet introduces permuted language modeling (PLM) for
-pre-training to address this problem. However, XLNet does not leverage the full position information of a sentence and
-thus suffers from position discrepancy between pre-training and fine-tuning. In this paper, we propose MPNet, a novel
-pre-training method that inherits the advantages of BERT and XLNet and avoids their limitations. MPNet leverages the
-dependency among predicted tokens through permuted language modeling (vs. MLM in BERT), and takes auxiliary position
-information as input to make the model see a full sentence and thus reducing the position discrepancy (vs. PLM in
-XLNet). We pre-train MPNet on a large-scale dataset (over 160GB text corpora) and fine-tune on a variety of
-down-streaming tasks (GLUE, SQuAD, etc). Experimental results show that MPNet outperforms MLM and PLM by a large
-margin, and achieves better results on these tasks compared with previous state-of-the-art pre-trained methods (e.g.,
-BERT, XLNet, RoBERTa) under the same model setting.*
-
-Tips:
-
-- MPNet doesn't have :obj:`token_type_ids`, you don't need to indicate which token belongs to which segment. just
-  separate your segments with the separation token :obj:`tokenizer.sep_token` (or :obj:`[sep]`).
-
-The original code can be found `here <https://github.com/microsoft/MPNet>`__.
-
-MPNetConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MPNetConfig
-    :members:
-
-
-MPNetTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MPNetTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-MPNetTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MPNetTokenizerFast
-    :members:
-
-
-MPNetModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MPNetModel
-    :members: forward
-
-
-MPNetForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MPNetForMaskedLM
-    :members: forward
-
-
-MPNetForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MPNetForSequenceClassification
-    :members: forward
-
-
-MPNetForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MPNetForMultipleChoice
-    :members: forward
-
-
-MPNetForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MPNetForTokenClassification
-    :members: forward
-
-
-MPNetForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MPNetForQuestionAnswering
-    :members: forward
-
-
-TFMPNetModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMPNetModel
-    :members: call
-
-
-TFMPNetForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMPNetForMaskedLM
-    :members: call
-
-
-TFMPNetForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMPNetForSequenceClassification
-    :members: call
-
-
-TFMPNetForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMPNetForMultipleChoice
-    :members: call
-
-
-TFMPNetForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMPNetForTokenClassification
-    :members: call
-
-
-TFMPNetForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMPNetForQuestionAnswering
-    :members: call
diff --git a/docs/source/model_doc/mt5.mdx b/docs/source/model_doc/mt5.mdx
new file mode 100644
index 0000000000..fa39e4ab2c
--- /dev/null
+++ b/docs/source/model_doc/mt5.mdx
@@ -0,0 +1,98 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# mT5
+
+## Overview
+
+The mT5 model was presented in [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya
+Siddhant, Aditya Barua, Colin Raffel.
+
+The abstract from the paper is the following:
+
+*The recent "Text-to-Text Transfer Transformer" (T5) leveraged a unified text-to-text format and scale to attain
+state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a
+multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail
+the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual
+benchmarks. We also describe a simple technique to prevent "accidental translation" in the zero-shot setting, where a
+generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model
+checkpoints used in this work are publicly available.*
+
+Note: mT5 was only pre-trained on [mC4](https://huggingface.co/datasets/mc4) excluding any supervised training.
+Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5 model.
+Since mT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
+fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
+
+Google has released the following variants:
+
+- [google/mt5-small](https://huggingface.co/google/mt5-small)
+
+- [google/mt5-base](https://huggingface.co/google/mt5-base)
+
+- [google/mt5-large](https://huggingface.co/google/mt5-large)
+
+- [google/mt5-xl](https://huggingface.co/google/mt5-xl)
+
+- [google/mt5-xxl](https://huggingface.co/google/mt5-xxl).
+
+This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be
+found [here](https://github.com/google-research/multilingual-t5).
+
+## MT5Config
+
+[[autodoc]] MT5Config
+
+## MT5Tokenizer
+
+[[autodoc]] MT5Tokenizer
+
+See [`T5Tokenizer`] for all details.
+
+
+## MT5TokenizerFast
+
+[[autodoc]] MT5TokenizerFast
+
+See [`T5TokenizerFast`] for all details.
+
+
+## MT5Model
+
+[[autodoc]] MT5Model
+
+## MT5ForConditionalGeneration
+
+[[autodoc]] MT5ForConditionalGeneration
+
+## MT5EncoderModel
+
+[[autodoc]] MT5EncoderModel
+
+## TFMT5Model
+
+[[autodoc]] TFMT5Model
+
+## TFMT5ForConditionalGeneration
+
+[[autodoc]] TFMT5ForConditionalGeneration
+
+## TFMT5EncoderModel
+
+[[autodoc]] TFMT5EncoderModel
+
+## FlaxMT5Model
+
+[[autodoc]] FlaxMT5Model
+
+## FlaxMT5ForConditionalGeneration
+
+[[autodoc]] FlaxMT5ForConditionalGeneration
diff --git a/docs/source/model_doc/mt5.rst b/docs/source/model_doc/mt5.rst
deleted file mode 100644
index 64713086fd..0000000000
--- a/docs/source/model_doc/mt5.rst
+++ /dev/null
@@ -1,129 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-mT5
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The mT5 model was presented in `mT5: A massively multilingual pre-trained text-to-text transformer
-<https://arxiv.org/abs/2010.11934>`_ by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya
-Siddhant, Aditya Barua, Colin Raffel.
-
-The abstract from the paper is the following:
-
-*The recent "Text-to-Text Transfer Transformer" (T5) leveraged a unified text-to-text format and scale to attain
-state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a
-multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail
-the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual
-benchmarks. We also describe a simple technique to prevent "accidental translation" in the zero-shot setting, where a
-generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model
-checkpoints used in this work are publicly available.*
-
-Note: mT5 was only pre-trained on `mC4 <https://huggingface.co/datasets/mc4>`__ excluding any supervised training.
-Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5 model.
-Since mT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
-fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
-
-Google has released the following variants:
-
-- `google/mt5-small <https://huggingface.co/google/mt5-small>`__
-
-- `google/mt5-base <https://huggingface.co/google/mt5-base>`__
-
-- `google/mt5-large <https://huggingface.co/google/mt5-large>`__
-
-- `google/mt5-xl <https://huggingface.co/google/mt5-xl>`__
-
-- `google/mt5-xxl <https://huggingface.co/google/mt5-xxl>`__.
-
-This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
-found `here <https://github.com/google-research/multilingual-t5>`__.
-
-MT5Config
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MT5Config
-    :members:
-
-
-MT5Tokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MT5Tokenizer
-
-See :class:`~transformers.T5Tokenizer` for all details.
-
-
-MT5TokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MT5TokenizerFast
-
-See :class:`~transformers.T5TokenizerFast` for all details.
-
-
-MT5Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MT5Model
-    :members:
-
-
-MT5ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MT5ForConditionalGeneration
-    :members:
-
-
-MT5EncoderModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MT5EncoderModel
-    :members:
-
-
-TFMT5Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMT5Model
-    :members:
-
-
-TFMT5ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMT5ForConditionalGeneration
-    :members:
-
-
-TFMT5EncoderModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFMT5EncoderModel
-    :members:
-
-
-FlaxMT5Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxMT5Model
-    :members:
-
-
-FlaxMT5ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxMT5ForConditionalGeneration
-    :members:
diff --git a/docs/source/model_doc/pegasus.mdx b/docs/source/model_doc/pegasus.mdx
new file mode 100644
index 0000000000..c05fcf80fa
--- /dev/null
+++ b/docs/source/model_doc/pegasus.mdx
@@ -0,0 +1,137 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Pegasus
+
+**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title)
+and assign @patrickvonplaten.
+
+
+## Overview
+
+The Pegasus model was proposed in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.
+
+According to the abstract,
+
+- Pegasus' pretraining task is intentionally similar to summarization: important sentences are removed/masked from an
+  input document and are generated together as one output sequence from the remaining sentences, similar to an
+  extractive summary.
+- Pegasus achieves SOTA summarization performance on all 12 downstream tasks, as measured by ROUGE and human eval.
+
+This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The Authors' code can be found [here](https://github.com/google-research/pegasus).
+
+
+## Checkpoints
+
+All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tuned for summarization, besides
+*pegasus-large*, whence the other checkpoints are fine-tuned:
+
+- Each checkpoint is 2.2 GB on disk and 568M parameters.
+- FP16 is not supported (help/ideas on this appreciated!).
+- Summarizing xsum in fp32 takes about 400ms/sample, with default parameters on a v100 GPU.
+- Full replication results and correctly pre-processed data can be found in this [Issue](https://github.com/huggingface/transformers/issues/6844#issue-689259666).
+- [Distilled checkpoints](https://huggingface.co/models?search=distill-pegasus) are described in this [paper](https://arxiv.org/abs/2010.13002).
+
+### Examples
+
+- [Script](https://github.com/huggingface/transformers/tree/master/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh) to fine-tune pegasus
+  on the XSUM dataset. Data download instructions at [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/master/examples/pytorch/summarization/README.md).
+- FP16 is not supported (help/ideas on this appreciated!).
+- The adafactor optimizer is recommended for pegasus fine-tuning.
+
+
+## Implementation Notes
+
+- All models are transformer encoder-decoders with 16 layers in each component.
+- The implementation is completely inherited from [`BartForConditionalGeneration`]
+- Some key configuration differences:
+
+  - static, sinusoidal position embeddings
+  - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix.
+  - more beams are used (`num_beams=8`)
+- All pretrained pegasus checkpoints are the same besides three attributes: `tokenizer.model_max_length` (maximum
+  input size), `max_length` (the maximum number of tokens to generate) and `length_penalty`.
+- The code to convert checkpoints trained in the author's [repo](https://github.com/google-research/pegasus) can be
+  found in `convert_pegasus_tf_to_pytorch.py`.
+
+
+## Usage Example
+
+```python
+>>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer
+>>> import torch
+>>> src_text = [
+...     """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
+>>> ]
+
+>>> model_name = 'google/pegasus-xsum'
+>>> device = 'cuda' if torch.cuda.is_available() else 'cpu'
+>>> tokenizer = PegasusTokenizer.from_pretrained(model_name)
+>>> model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
+>>> batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
+>>> translated = model.generate(**batch)
+>>> tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
+>>> assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
+```
+
+## PegasusConfig
+
+[[autodoc]] PegasusConfig
+
+## PegasusTokenizer
+
+warning: `add_tokens` does not work at the moment.
+
+[[autodoc]] PegasusTokenizer
+
+## PegasusTokenizerFast
+
+[[autodoc]] PegasusTokenizerFast
+
+## PegasusModel
+
+[[autodoc]] PegasusModel
+    - forward
+
+## PegasusForConditionalGeneration
+
+[[autodoc]] PegasusForConditionalGeneration
+    - forward
+
+## PegasusForCausalLM
+
+[[autodoc]] PegasusForCausalLM
+    - forward
+
+## TFPegasusModel
+
+[[autodoc]] TFPegasusModel
+    - call
+
+## TFPegasusForConditionalGeneration
+
+[[autodoc]] TFPegasusForConditionalGeneration
+    - call
+
+## FlaxPegasusModel
+
+[[autodoc]] FlaxPegasusModel
+    - __call__
+    - encode
+    - decode
+
+## FlaxPegasusForConditionalGeneration
+
+[[autodoc]] FlaxPegasusForConditionalGeneration
+    - __call__
+    - encode
+    - decode
diff --git a/docs/source/model_doc/pegasus.rst b/docs/source/model_doc/pegasus.rst
deleted file mode 100644
index fe05fb1de7..0000000000
--- a/docs/source/model_doc/pegasus.rst
+++ /dev/null
@@ -1,168 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Pegasus
------------------------------------------------------------------------------------------------------------------------
-
-**DISCLAIMER:** If you see something strange, file a `Github Issue
-<https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title>`__
-and assign @patrickvonplaten.
-
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Pegasus model was proposed in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization
-<https://arxiv.org/pdf/1912.08777.pdf>`__ by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.
-
-According to the abstract,
-
-- Pegasus' pretraining task is intentionally similar to summarization: important sentences are removed/masked from an
-  input document and are generated together as one output sequence from the remaining sentences, similar to an
-  extractive summary.
-- Pegasus achieves SOTA summarization performance on all 12 downstream tasks, as measured by ROUGE and human eval.
-
-This model was contributed by `sshleifer <https://huggingface.co/sshleifer>`__. The Authors' code can be found `here
-<https://github.com/google-research/pegasus>`__.
-
-
-Checkpoints
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-All the `checkpoints <https://huggingface.co/models?search=pegasus>`__ are fine-tuned for summarization, besides
-`pegasus-large`, whence the other checkpoints are fine-tuned:
-
-- Each checkpoint is 2.2 GB on disk and 568M parameters.
-- FP16 is not supported (help/ideas on this appreciated!).
-- Summarizing xsum in fp32 takes about 400ms/sample, with default parameters on a v100 GPU.
-- Full replication results and correctly pre-processed data can be found in this `Issue
-  <https://github.com/huggingface/transformers/issues/6844#issue-689259666>`__.
-- `Distilled checkpoints <https://huggingface.co/models?search=distill-pegasus>`__ are described in this `paper
-  <https://arxiv.org/abs/2010.13002>`__.
-
-Examples
-_______________________________________________________________________________________________________________________
-
-- :prefix_link:`Script <examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh>` to fine-tune pegasus
-  on the XSUM dataset. Data download instructions at :prefix_link:`examples/pytorch/summarization/
-  <examples/pytorch/summarization/README.md>`.
-- FP16 is not supported (help/ideas on this appreciated!).
-- The adafactor optimizer is recommended for pegasus fine-tuning.
-
-
-Implementation Notes
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-- All models are transformer encoder-decoders with 16 layers in each component.
-- The implementation is completely inherited from :class:`~transformers.BartForConditionalGeneration`
-- Some key configuration differences:
-
-    - static, sinusoidal position embeddings
-    - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix.
-    - more beams are used (:obj:`num_beams=8`)
-- All pretrained pegasus checkpoints are the same besides three attributes: :obj:`tokenizer.model_max_length` (maximum
-  input size), :obj:`max_length` (the maximum number of tokens to generate) and :obj:`length_penalty`.
-- The code to convert checkpoints trained in the author's `repo <https://github.com/google-research/pegasus>`_ can be
-  found in ``convert_pegasus_tf_to_pytorch.py``.
-
-
-Usage Example
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. code-block:: python
-
-    >>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer
-    >>> import torch
-    >>> src_text = [
-    ...     """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
-    >>> ]
-
-    >>> model_name = 'google/pegasus-xsum'
-    >>> device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    >>> tokenizer = PegasusTokenizer.from_pretrained(model_name)
-    >>> model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
-    >>> batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
-    >>> translated = model.generate(**batch)
-    >>> tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
-    >>> assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
-
-
-
-PegasusConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.PegasusConfig
-
-
-PegasusTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-warning: ``add_tokens`` does not work at the moment.
-
-.. autoclass:: transformers.PegasusTokenizer
-    :members:
-
-
-PegasusTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.PegasusTokenizerFast
-    :members:
-
-
-PegasusModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.PegasusModel
-    :members: forward
-
-
-PegasusForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.PegasusForConditionalGeneration
-    :members: forward
-
-
-PegasusForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.PegasusForCausalLM
-    :members: forward
-
-
-TFPegasusModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFPegasusModel
-    :members: call
-
-
-TFPegasusForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFPegasusForConditionalGeneration
-    :members: call
-
-
-FlaxPegasusModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxPegasusModel
-    :members: __call__, encode, decode
-
-
-FlaxPegasusForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxPegasusForConditionalGeneration
-    :members: __call__, encode, decode
diff --git a/docs/source/model_doc/phobert.mdx b/docs/source/model_doc/phobert.mdx
new file mode 100644
index 0000000000..4ae9b0aa62
--- /dev/null
+++ b/docs/source/model_doc/phobert.mdx
@@ -0,0 +1,53 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# PhoBERT
+
+## Overview
+
+The PhoBERT model was proposed in [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92.pdf) by Dat Quoc Nguyen, Anh Tuan Nguyen.
+
+The abstract from the paper is the following:
+
+*We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual
+language models pre-trained for Vietnamese. Experimental results show that PhoBERT consistently outperforms the recent
+best pre-trained multilingual model XLM-R (Conneau et al., 2020) and improves the state-of-the-art in multiple
+Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and
+Natural language inference.*
+
+Example of use:
+
+```python
+>>> import torch
+>>> from transformers import AutoModel, AutoTokenizer
+
+>>> phobert = AutoModel.from_pretrained("vinai/phobert-base")
+>>> tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
+
+>>> # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
+>>> line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
+
+>>> input_ids = torch.tensor([tokenizer.encode(line)])
+
+>>> with torch.no_grad():
+...     features = phobert(input_ids)  # Models outputs are now tuples
+
+>>> # With TensorFlow 2.0+:
+>>> # from transformers import TFAutoModel
+>>> # phobert = TFAutoModel.from_pretrained("vinai/phobert-base")
+```
+
+This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The original code can be found [here](https://github.com/VinAIResearch/PhoBERT).
+
+## PhobertTokenizer
+
+[[autodoc]] PhobertTokenizer
diff --git a/docs/source/model_doc/phobert.rst b/docs/source/model_doc/phobert.rst
deleted file mode 100644
index ffe8657ab3..0000000000
--- a/docs/source/model_doc/phobert.rst
+++ /dev/null
@@ -1,60 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-PhoBERT
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The PhoBERT model was proposed in `PhoBERT: Pre-trained language models for Vietnamese
-<https://www.aclweb.org/anthology/2020.findings-emnlp.92.pdf>`__ by Dat Quoc Nguyen, Anh Tuan Nguyen.
-
-The abstract from the paper is the following:
-
-*We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual
-language models pre-trained for Vietnamese. Experimental results show that PhoBERT consistently outperforms the recent
-best pre-trained multilingual model XLM-R (Conneau et al., 2020) and improves the state-of-the-art in multiple
-Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and
-Natural language inference.*
-
-Example of use:
-
-.. code-block::
-
-    >>> import torch
-    >>> from transformers import AutoModel, AutoTokenizer
-
-    >>> phobert = AutoModel.from_pretrained("vinai/phobert-base")
-    >>> tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
-
-    >>> # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
-    >>> line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
-
-    >>> input_ids = torch.tensor([tokenizer.encode(line)])
-
-    >>> with torch.no_grad():
-    ...     features = phobert(input_ids)  # Models outputs are now tuples
-
-    >>> # With TensorFlow 2.0+:
-    >>> # from transformers import TFAutoModel
-    >>> # phobert = TFAutoModel.from_pretrained("vinai/phobert-base")
-
-
-This model was contributed by `dqnguyen <https://huggingface.co/dqnguyen>`__. The original code can be found `here
-<https://github.com/VinAIResearch/PhoBERT>`__.
-
-PhobertTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.PhobertTokenizer
-    :members: 
diff --git a/docs/source/model_doc/prophetnet.mdx b/docs/source/model_doc/prophetnet.mdx
new file mode 100644
index 0000000000..951bbc5b96
--- /dev/null
+++ b/docs/source/model_doc/prophetnet.mdx
@@ -0,0 +1,82 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# ProphetNet
+
+**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign
+@patrickvonplaten
+
+## Overview
+
+The ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei
+Zhang, Ming Zhou on 13 Jan, 2020.
+
+ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of just
+the next token.
+
+The abstract from the paper is the following:
+
+*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
+self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
+the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
+n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
+step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent
+overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
+dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
+abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
+state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
+
+The Authors' code can be found [here](https://github.com/microsoft/ProphetNet).
+
+
+## ProphetNetConfig
+
+[[autodoc]] ProphetNetConfig
+
+## ProphetNetTokenizer
+
+[[autodoc]] ProphetNetTokenizer
+
+## ProphetNet specific outputs
+
+[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqLMOutput
+
+[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqModelOutput
+
+[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetDecoderModelOutput
+
+[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetDecoderLMOutput
+
+## ProphetNetModel
+
+[[autodoc]] ProphetNetModel
+    - forward
+
+## ProphetNetEncoder
+
+[[autodoc]] ProphetNetEncoder
+    - forward
+
+## ProphetNetDecoder
+
+[[autodoc]] ProphetNetDecoder
+    - forward
+
+## ProphetNetForConditionalGeneration
+
+[[autodoc]] ProphetNetForConditionalGeneration
+    - forward
+
+## ProphetNetForCausalLM
+
+[[autodoc]] ProphetNetForCausalLM
+    - forward
diff --git a/docs/source/model_doc/prophetnet.rst b/docs/source/model_doc/prophetnet.rst
deleted file mode 100644
index a1e0e75e7b..0000000000
--- a/docs/source/model_doc/prophetnet.rst
+++ /dev/null
@@ -1,106 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-ProphetNet
------------------------------------------------------------------------------------------------------------------------
-
-**DISCLAIMER:** If you see something strange, file a `Github Issue
-<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
-@patrickvonplaten
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The ProphetNet model was proposed in `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,
-<https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei
-Zhang, Ming Zhou on 13 Jan, 2020.
-
-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of just
-the next token.
-
-The abstract from the paper is the following:
-
-*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
-self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
-the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
-n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
-step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent
-overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
-dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
-abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
-state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
-
-The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.
-
-
-ProphetNetConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ProphetNetConfig
-    :members:
-
-
-ProphetNetTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ProphetNetTokenizer
-    :members:
-
-
-ProphetNet specific outputs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqLMOutput
-    :members:
-
-.. autoclass:: transformers.models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqModelOutput
-    :members:
-
-.. autoclass:: transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderModelOutput
-    :members:
-
-.. autoclass:: transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderLMOutput
-    :members:
-
-ProphetNetModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ProphetNetModel
-    :members: forward
-
-
-ProphetNetEncoder
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ProphetNetEncoder
-    :members: forward
-
-
-ProphetNetDecoder
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ProphetNetDecoder
-    :members: forward
-
-
-ProphetNetForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ProphetNetForConditionalGeneration
-    :members: forward
-
-
-ProphetNetForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ProphetNetForCausalLM
-    :members: forward
diff --git a/docs/source/model_doc/qdqbert.mdx b/docs/source/model_doc/qdqbert.mdx
new file mode 100644
index 0000000000..210fa10892
--- /dev/null
+++ b/docs/source/model_doc/qdqbert.mdx
@@ -0,0 +1,158 @@
+<!--Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# QDQBERT
+
+## Overview
+
+The QDQBERT model can be referenced in [Integer Quantization for Deep Learning Inference: Principles and Empirical
+Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius
+Micikevicius.
+
+The abstract from the paper is the following:
+
+*Quantization techniques can reduce the size of Deep Neural Networks and improve inference latency and throughput by
+taking advantage of high throughput integer instructions. In this paper we review the mathematical aspects of
+quantization parameters and evaluate their choices on a wide range of neural network models for different application
+domains, including vision, speech, and language. We focus on quantization techniques that are amenable to acceleration
+by processors with high-throughput integer math pipelines. We also present a workflow for 8-bit quantization that is
+able to maintain accuracy within 1% of the floating-point baseline on all networks studied, including models that are
+more difficult to quantize, such as MobileNets and BERT-large.*
+
+Tips:
+
+- QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to (i) linear layer
+  inputs and weights, (ii) matmul inputs, (iii) residual add inputs, in BERT model.
+
+- QDQBERT requires the dependency of [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). To install `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com`
+
+- QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *bert-base-uncased*), and
+  perform Quantization Aware Training/Post Training Quantization.
+
+- A complete example of using QDQBERT model to perform Quatization Aware Training and Post Training Quantization for
+  SQUAD task can be found at [transformers/examples/research_projects/quantization-qdqbert/](examples/research_projects/quantization-qdqbert/).
+
+This model was contributed by [shangz](https://huggingface.co/shangz).
+
+
+### Set default quantizers
+
+QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to BERT by
+`TensorQuantizer` in [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). `TensorQuantizer` is the module
+for quantizing tensors, with `QuantDescriptor` defining how the tensor should be quantized. Refer to [Pytorch
+Quantization Toolkit userguide](https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/userguide.html) for more details.
+
+Before creating QDQBERT model, one has to set the default `QuantDescriptor` defining default tensor quantizers.
+
+Example:
+
+```python
+>>> import pytorch_quantization.nn as quant_nn
+>>> from pytorch_quantization.tensor_quant import QuantDescriptor
+
+>>> # The default tensor quantizer is set to use Max calibration method
+>>> input_desc = QuantDescriptor(num_bits=8, calib_method="max")
+>>> # The default tensor quantizer is set to be per-channel quantization for weights
+>>> weight_desc = QuantDescriptor(num_bits=8, axis=((0,)))
+>>> quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
+>>> quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
+```
+
+### Calibration
+
+Calibration is the terminology of passing data samples to the quantizer and deciding the best scaling factors for
+tensors. After setting up the tensor quantizers, one can use the following example to calibrate the model:
+
+```python
+>>> # Find the TensorQuantizer and enable calibration
+>>> for name, module in model.named_modules():
+>>>     if name.endswith('_input_quantizer'):
+>>>         module.enable_calib()
+>>>         module.disable_quant()  # Use full precision data to calibrate
+
+>>> # Feeding data samples
+>>> model(x)
+>>> # ...
+
+>>> # Finalize calibration
+>>> for name, module in model.named_modules():
+>>>     if name.endswith('_input_quantizer'):
+>>>         module.load_calib_amax()
+>>>         module.enable_quant()
+
+>>> # If running on GPU, it needs to call .cuda() again because new tensors will be created by calibration process
+>>> model.cuda()
+
+>>> # Keep running the quantized model
+>>> # ...
+```
+
+### Export to ONNX
+
+The goal of exporting to ONNX is to deploy inference by [TensorRT](https://developer.nvidia.com/tensorrt). Fake
+quantization will be broken into a pair of QuantizeLinear/DequantizeLinear ONNX ops. After setting static member of
+TensorQuantizer to use Pytorch’s own fake quantization functions, fake quantized model can be exported to ONNX, follow
+the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Example:
+
+```python
+>>> from pytorch_quantization.nn import TensorQuantizer
+>>> TensorQuantizer.use_fb_fake_quant = True
+
+>>> # Load the calibrated model
+>>> ...
+>>> # ONNX export
+>>> torch.onnx.export(...)
+```
+
+## QDQBertConfig
+
+[[autodoc]] QDQBertConfig
+
+## QDQBertModel
+
+[[autodoc]] QDQBertModel
+    - forward
+
+## QDQBertLMHeadModel
+
+[[autodoc]] QDQBertLMHeadModel
+    - forward
+
+## QDQBertForMaskedLM
+
+[[autodoc]] QDQBertForMaskedLM
+    - forward
+
+## QDQBertForSequenceClassification
+
+[[autodoc]] QDQBertForSequenceClassification
+    - forward
+
+## QDQBertForNextSentencePrediction
+
+[[autodoc]] QDQBertForNextSentencePrediction
+    - forward
+
+## QDQBertForMultipleChoice
+
+[[autodoc]] QDQBertForMultipleChoice
+    - forward
+
+## QDQBertForTokenClassification
+
+[[autodoc]] QDQBertForTokenClassification
+    - forward
+
+## QDQBertForQuestionAnswering
+
+[[autodoc]] QDQBertForQuestionAnswering
+    - forward
diff --git a/docs/source/model_doc/qdqbert.rst b/docs/source/model_doc/qdqbert.rst
deleted file mode 100644
index fa54c2b849..0000000000
--- a/docs/source/model_doc/qdqbert.rst
+++ /dev/null
@@ -1,189 +0,0 @@
-.. 
-    Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-QDQBERT
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The QDQBERT model can be referenced in `Integer Quantization for Deep Learning Inference: Principles and Empirical
-Evaluation <https://arxiv.org/abs/2004.09602>`__ by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius
-Micikevicius.
-
-The abstract from the paper is the following:
-
-*Quantization techniques can reduce the size of Deep Neural Networks and improve inference latency and throughput by
-taking advantage of high throughput integer instructions. In this paper we review the mathematical aspects of
-quantization parameters and evaluate their choices on a wide range of neural network models for different application
-domains, including vision, speech, and language. We focus on quantization techniques that are amenable to acceleration
-by processors with high-throughput integer math pipelines. We also present a workflow for 8-bit quantization that is
-able to maintain accuracy within 1% of the floating-point baseline on all networks studied, including models that are
-more difficult to quantize, such as MobileNets and BERT-large.*
-
-Tips:
-
-- QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to (i) linear layer
-  inputs and weights, (ii) matmul inputs, (iii) residual add inputs, in BERT model.
-
-- QDQBERT requires the dependency of `Pytorch Quantization Toolkit
-  <https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization>`__. To install ``pip install
-  pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com``
-
-- QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *bert-base-uncased*), and
-  perform Quantization Aware Training/Post Training Quantization.
-
-- A complete example of using QDQBERT model to perform Quatization Aware Training and Post Training Quantization for
-  SQUAD task can be found at `transformers/examples/research_projects/quantization-qdqbert/
-  </examples/research_projects/quantization-qdqbert/>`_.
-
-This model was contributed by `shangz <https://huggingface.co/shangz>`__.
-
-
-Set default quantizers
-_______________________________________________________________________________________________________________________
-
-QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to BERT by
-:obj:`TensorQuantizer` in `Pytorch Quantization Toolkit
-<https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization>`__. :obj:`TensorQuantizer` is the module
-for quantizing tensors, with :obj:`QuantDescriptor` defining how the tensor should be quantized. Refer to `Pytorch
-Quantization Toolkit userguide
-<https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/userguide.html>`__ for more details.
-
-Before creating QDQBERT model, one has to set the default :obj:`QuantDescriptor` defining default tensor quantizers.
-Example:
-
-.. code-block::
-
-    >>> import pytorch_quantization.nn as quant_nn
-    >>> from pytorch_quantization.tensor_quant import QuantDescriptor
-
-    >>> # The default tensor quantizer is set to use Max calibration method
-    >>> input_desc = QuantDescriptor(num_bits=8, calib_method="max")
-    >>> # The default tensor quantizer is set to be per-channel quantization for weights
-    >>> weight_desc = QuantDescriptor(num_bits=8, axis=((0,)))
-    >>> quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
-    >>> quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
-
-
-Calibration
-_______________________________________________________________________________________________________________________
-
-Calibration is the terminology of passing data samples to the quantizer and deciding the best scaling factors for
-tensors. After setting up the tensor quantizers, one can use the following example to calibrate the model:
-
-.. code-block::
-
-    >>> # Find the TensorQuantizer and enable calibration
-    >>> for name, module in model.named_modules():
-    >>>     if name.endswith('_input_quantizer'):
-    >>>         module.enable_calib()
-    >>>         module.disable_quant()  # Use full precision data to calibrate
-
-    >>> # Feeding data samples
-    >>> model(x)
-    >>> # ...
-
-    >>> # Finalize calibration
-    >>> for name, module in model.named_modules():
-    >>>     if name.endswith('_input_quantizer'):
-    >>>         module.load_calib_amax()
-    >>>         module.enable_quant()
-
-    >>> # If running on GPU, it needs to call .cuda() again because new tensors will be created by calibration process
-    >>> model.cuda()
-
-    >>> # Keep running the quantized model
-    >>> # ...
-
-
-Export to ONNX
-_______________________________________________________________________________________________________________________
-
-The goal of exporting to ONNX is to deploy inference by `TensorRT <https://developer.nvidia.com/tensorrt>`__. Fake
-quantization will be broken into a pair of QuantizeLinear/DequantizeLinear ONNX ops. After setting static member of
-TensorQuantizer to use Pytorch’s own fake quantization functions, fake quantized model can be exported to ONNX, follow
-the instructions in `torch.onnx <https://pytorch.org/docs/stable/onnx.html>`__. Example:
-
-.. code-block::
-
-    >>> from pytorch_quantization.nn import TensorQuantizer
-    >>> TensorQuantizer.use_fb_fake_quant = True
-
-    >>> # Load the calibrated model
-    >>> ...
-    >>> # ONNX export
-    >>> torch.onnx.export(...)
-
-
-QDQBertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.QDQBertConfig
-    :members:
-
-
-QDQBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.QDQBertModel
-    :members: forward
-
-
-QDQBertLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.QDQBertLMHeadModel
-    :members: forward
-
-
-QDQBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.QDQBertForMaskedLM
-    :members: forward
-
-
-QDQBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.QDQBertForSequenceClassification
-    :members: forward
-
-
-QDQBertForNextSentencePrediction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.QDQBertForNextSentencePrediction
-    :members: forward
-
-
-QDQBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.QDQBertForMultipleChoice
-    :members: forward
-
-
-QDQBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.QDQBertForTokenClassification
-    :members: forward
-
-
-QDQBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.QDQBertForQuestionAnswering
-    :members: forward
-
diff --git a/docs/source/model_doc/rag.mdx b/docs/source/model_doc/rag.mdx
new file mode 100644
index 0000000000..2f5d3498d8
--- /dev/null
+++ b/docs/source/model_doc/rag.mdx
@@ -0,0 +1,96 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# RAG
+
+## Overview
+
+Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and
+sequence-to-sequence models. RAG models retrieve documents, pass them to a seq2seq model, then marginalize to generate
+outputs. The retriever and seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing
+both retrieval and generation to adapt to downstream tasks.
+
+It is based on the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir
+Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
+
+The abstract from the paper is the following:
+
+*Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve
+state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely
+manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind
+task-specific architectures. Additionally, providing provenance for their decisions and updating their world knowledge
+remain open research problems. Pre-trained models with a differentiable access mechanism to explicit nonparametric
+memory can overcome this issue, but have so far been only investigated for extractive downstream tasks. We explore a
+general-purpose fine-tuning recipe for retrieval-augmented generation (RAG) — models which combine pre-trained
+parametric and non-parametric memory for language generation. We introduce RAG models where the parametric memory is a
+pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a
+pre-trained neural retriever. We compare two RAG formulations, one which conditions on the same retrieved passages
+across the whole generated sequence, the other can use different passages per token. We fine-tune and evaluate our
+models on a wide range of knowledge-intensive NLP tasks and set the state-of-the-art on three open domain QA tasks,
+outperforming parametric seq2seq models and task-specific retrieve-and-extract architectures. For language generation
+tasks, we find that RAG models generate more specific, diverse and factual language than a state-of-the-art
+parametric-only seq2seq baseline.*
+
+This model was contributed by [ola13](https://huggingface.co/ola13).
+
+
+## RagConfig
+
+[[autodoc]] RagConfig
+
+## RagTokenizer
+
+[[autodoc]] RagTokenizer
+
+## Rag specific outputs
+
+[[autodoc]] models.rag.modeling_rag.RetrievAugLMMarginOutput
+
+[[autodoc]] models.rag.modeling_rag.RetrievAugLMOutput
+
+## RagRetriever
+
+[[autodoc]] RagRetriever
+
+## RagModel
+
+[[autodoc]] RagModel
+    - forward
+
+## RagSequenceForGeneration
+
+[[autodoc]] RagSequenceForGeneration
+    - forward
+    - generate
+
+## RagTokenForGeneration
+
+[[autodoc]] RagTokenForGeneration
+    - forward
+    - generate
+
+## TFRagModel
+
+[[autodoc]] TFRagModel
+    - call
+
+## TFRagSequenceForGeneration
+
+[[autodoc]] TFRagSequenceForGeneration
+    - call
+    - generate
+
+## TFRagTokenForGeneration
+
+[[autodoc]] TFRagTokenForGeneration
+    - call
+    - generate
diff --git a/docs/source/model_doc/rag.rst b/docs/source/model_doc/rag.rst
deleted file mode 100644
index 62acc18e8f..0000000000
--- a/docs/source/model_doc/rag.rst
+++ /dev/null
@@ -1,118 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-RAG
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and
-sequence-to-sequence models. RAG models retrieve documents, pass them to a seq2seq model, then marginalize to generate
-outputs. The retriever and seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing
-both retrieval and generation to adapt to downstream tasks.
-
-It is based on the paper `Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks
-<https://arxiv.org/abs/2005.11401>`__ by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir
-Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
-
-The abstract from the paper is the following:
-
-*Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve
-state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely
-manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind
-task-specific architectures. Additionally, providing provenance for their decisions and updating their world knowledge
-remain open research problems. Pre-trained models with a differentiable access mechanism to explicit nonparametric
-memory can overcome this issue, but have so far been only investigated for extractive downstream tasks. We explore a
-general-purpose fine-tuning recipe for retrieval-augmented generation (RAG) — models which combine pre-trained
-parametric and non-parametric memory for language generation. We introduce RAG models where the parametric memory is a
-pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a
-pre-trained neural retriever. We compare two RAG formulations, one which conditions on the same retrieved passages
-across the whole generated sequence, the other can use different passages per token. We fine-tune and evaluate our
-models on a wide range of knowledge-intensive NLP tasks and set the state-of-the-art on three open domain QA tasks,
-outperforming parametric seq2seq models and task-specific retrieve-and-extract architectures. For language generation
-tasks, we find that RAG models generate more specific, diverse and factual language than a state-of-the-art
-parametric-only seq2seq baseline.*
-
-This model was contributed by `ola13 <https://huggingface.co/ola13>`__.
-
-
-RagConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RagConfig
-    :members:
-
-
-RagTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RagTokenizer
-    :members:
-
-
-Rag specific outputs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.models.rag.modeling_rag.RetrievAugLMMarginOutput
-    :members:
-
-.. autoclass:: transformers.models.rag.modeling_rag.RetrievAugLMOutput
-    :members:
-
-RagRetriever
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RagRetriever
-    :members:
-
-
-RagModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RagModel
-    :members: forward
-
-
-RagSequenceForGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RagSequenceForGeneration
-    :members: forward, generate
-
-
-RagTokenForGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RagTokenForGeneration
-    :members: forward, generate
-
-
-TFRagModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRagModel
-    :members: call
-
-
-TFRagSequenceForGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRagSequenceForGeneration
-    :members: call, generate
-
-
-TFRagTokenForGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRagTokenForGeneration
-    :members: call, generate
diff --git a/docs/source/model_doc/reformer.mdx b/docs/source/model_doc/reformer.mdx
new file mode 100644
index 0000000000..24c8375f60
--- /dev/null
+++ b/docs/source/model_doc/reformer.mdx
@@ -0,0 +1,177 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Reformer
+
+**DISCLAIMER:** This model is still a work in progress, if you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title).
+
+## Overview
+
+The Reformer model was proposed in the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451.pdf) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+
+The abstract from the paper is the following:
+
+*Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can
+be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of
+Transformers. For one, we replace dot-product attention by one that uses locality-sensitive hashing, changing its
+complexity from O(L^2) to O(Llog(L)), where L is the length of the sequence. Furthermore, we use reversible residual
+layers instead of the standard residuals, which allows storing activations only once in the training process instead of
+N times, where N is the number of layers. The resulting model, the Reformer, performs on par with Transformer models
+while being much more memory-efficient and much faster on long sequences.*
+
+This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be
+found [here](https://github.com/google/trax/tree/master/trax/models/reformer).
+
+**Note**:
+
+- Reformer does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035)
+
+## Axial Positional Encodings
+
+Axial Positional Encodings were first implemented in Google's [trax library](https://github.com/google/trax/blob/4d99ad4965bab1deba227539758d59f0df0fef48/trax/layers/research/position_encodings.py#L29)
+and developed by the authors of this model's paper. In models that are treating very long input sequences, the
+conventional position id encodings store an embedings vector of size \\(d\\) being the `config.hidden_size` for
+every position \\(i, \ldots, n_s\\), with \\(n_s\\) being `config.max_embedding_size`. This means that having
+a sequence length of \\(n_s = 2^{19} \approx 0.5M\\) and a `config.hidden_size` of \\(d = 2^{10} \approx 1000\\)
+would result in a position encoding matrix:
+
+$$X_{i,j}, \text{ with } i \in \left[1,\ldots, d\right] \text{ and } j \in \left[1,\ldots, n_s\right]$$
+
+which alone has over 500M parameters to store. Axial positional encodings factorize \\(X_{i,j}\\) into two matrices:
+
+$$X^{1}_{i,j}, \text{ with } i \in \left[1,\ldots, d^1\right] \text{ and } j \in \left[1,\ldots, n_s^1\right]$$
+
+and
+
+$$X^{2}_{i,j}, \text{ with } i \in \left[1,\ldots, d^2\right] \text{ and } j \in \left[1,\ldots, n_s^2\right]$$
+
+with:
+
+$$d = d^1 + d^2 \text{ and } n_s = n_s^1 \times n_s^2 .$$
+
+Therefore the following holds:
+
+$$X_{i,j} = \begin{cases}
+X^{1}_{i, k}, & \text{if }\ i < d^1 \text{ with } k = j \mod n_s^1 \\
+X^{2}_{i - d^1, l}, & \text{if } i \ge d^1 \text{ with } l = \lfloor\frac{j}{n_s^1}\rfloor
+\end{cases}$$
+
+Intuitively, this means that a position embedding vector \\(x_j \in \mathbb{R}^{d}\\) is now the composition of two
+factorized embedding vectors: \\(x^1_{k, l} + x^2_{l, k}\\), where as the `config.max_embedding_size` dimension
+\\(j\\) is factorized into \\(k \text{ and } l\\). This design ensures that each position embedding vector
+\\(x_j\\) is unique.
+
+Using the above example again, axial position encoding with \\(d^1 = 2^5, d^2 = 2^5, n_s^1 = 2^9, n_s^2 = 2^{10}\\)
+can drastically reduced the number of parameters to \\(2^{14} + 2^{15} \approx 49000\\) parameters.
+
+In practice, the parameter `config.axial_pos_embds_dim` is set to a tuple \\((d^1, d^2)\\) which sum has to be
+equal to `config.hidden_size` and `config.axial_pos_shape` is set to a tuple \\((n_s^1, n_s^2)\\) which
+product has to be equal to `config.max_embedding_size`, which during training has to be equal to the *sequence
+length* of the `input_ids`.
+
+
+## LSH Self Attention
+
+In Locality sensitive hashing (LSH) self attention the key and query projection weights are tied. Therefore, the key
+query embedding vectors are also tied. LSH self attention uses the locality sensitive hashing mechanism proposed in
+[Practical and Optimal LSH for Angular Distance](https://arxiv.org/abs/1509.02897) to assign each of the tied key
+query embedding vectors to one of `config.num_buckets` possible buckets. The premise is that the more "similar"
+key query embedding vectors (in terms of *cosine similarity*) are to each other, the more likely they are assigned to
+the same bucket.
+
+The accuracy of the LSH mechanism can be improved by increasing `config.num_hashes` or directly the argument
+`num_hashes` of the forward function so that the output of the LSH self attention better approximates the output
+of the "normal" full self attention. The buckets are then sorted and chunked into query key embedding vector chunks
+each of length `config.lsh_chunk_length`. For each chunk, the query embedding vectors attend to its key vectors
+(which are tied to themselves) and to the key embedding vectors of `config.lsh_num_chunks_before` previous
+neighboring chunks and `config.lsh_num_chunks_after` following neighboring chunks.
+
+For more information, see the [original Paper](https://arxiv.org/abs/2001.04451) or this great [blog post](https://www.pragmatic.ml/reformer-deep-dive/).
+
+Note that `config.num_buckets` can also be factorized into a list \\((n_{\text{buckets}}^1,
+n_{\text{buckets}}^2)\\). This way instead of assigning the query key embedding vectors to one of \\((1,\ldots,
+n_{\text{buckets}})\\) they are assigned to one of \\((1-1,\ldots, n_{\text{buckets}}^1-1, \ldots,
+1-n_{\text{buckets}}^2, \ldots, n_{\text{buckets}}^1-n_{\text{buckets}}^2)\\). This is crucial for very long sequences to
+save memory.
+
+When training a model from scratch, it is recommended to leave `config.num_buckets=None`, so that depending on the
+sequence length a good value for `num_buckets` is calculated on the fly. This value will then automatically be
+saved in the config and should be reused for inference.
+
+Using LSH self attention, the memory and time complexity of the query-key matmul operation can be reduced from
+\\(\mathcal{O}(n_s \times n_s)\\) to \\(\mathcal{O}(n_s \times \log(n_s))\\), which usually represents the memory
+and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length.
+
+
+## Local Self Attention
+
+Local self attention is essentially a "normal" self attention layer with key, query and value projections, but is
+chunked so that in each chunk of length `config.local_chunk_length` the query embedding vectors only attends to
+the key embedding vectors in its chunk and to the key embedding vectors of `config.local_num_chunks_before`
+previous neighboring chunks and `config.local_num_chunks_after` following neighboring chunks.
+
+Using Local self attention, the memory and time complexity of the query-key matmul operation can be reduced from
+\\(\mathcal{O}(n_s \times n_s)\\) to \\(\mathcal{O}(n_s \times \log(n_s))\\), which usually represents the memory
+and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length.
+
+
+## Training
+
+During training, we must ensure that the sequence length is set to a value that can be divided by the least common
+multiple of `config.lsh_chunk_length` and `config.local_chunk_length` and that the parameters of the Axial
+Positional Encodings are correctly set as described above. Reformer is very memory efficient so that the model can
+easily be trained on sequences as long as 64000 tokens.
+
+For training, the [`ReformerModelWithLMHead`] should be used as follows:
+
+```python
+input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+loss = model(input_ids, labels=input_ids)[0]
+```
+
+## ReformerConfig
+
+[[autodoc]] ReformerConfig
+
+## ReformerTokenizer
+
+[[autodoc]] ReformerTokenizer
+    - save_vocabulary
+
+## ReformerTokenizerFast
+
+[[autodoc]] ReformerTokenizerFast
+
+## ReformerModel
+
+[[autodoc]] ReformerModel
+    - forward
+
+## ReformerModelWithLMHead
+
+[[autodoc]] ReformerModelWithLMHead
+    - forward
+
+## ReformerForMaskedLM
+
+[[autodoc]] ReformerForMaskedLM
+    - forward
+
+## ReformerForSequenceClassification
+
+[[autodoc]] ReformerForSequenceClassification
+    - forward
+
+## ReformerForQuestionAnswering
+
+[[autodoc]] ReformerForQuestionAnswering
+    - forward
diff --git a/docs/source/model_doc/reformer.rst b/docs/source/model_doc/reformer.rst
deleted file mode 100644
index 6842884d76..0000000000
--- a/docs/source/model_doc/reformer.rst
+++ /dev/null
@@ -1,211 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Reformer
------------------------------------------------------------------------------------------------------------------------
-
-**DISCLAIMER:** This model is still a work in progress, if you see something strange, file a `Github Issue
-<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Reformer model was proposed in the paper `Reformer: The Efficient Transformer
-<https://arxiv.org/abs/2001.04451.pdf>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-
-The abstract from the paper is the following:
-
-*Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can
-be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of
-Transformers. For one, we replace dot-product attention by one that uses locality-sensitive hashing, changing its
-complexity from O(L^2) to O(Llog(L)), where L is the length of the sequence. Furthermore, we use reversible residual
-layers instead of the standard residuals, which allows storing activations only once in the training process instead of
-N times, where N is the number of layers. The resulting model, the Reformer, performs on par with Transformer models
-while being much more memory-efficient and much faster on long sequences.*
-
-This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The Authors' code can be
-found `here <https://github.com/google/trax/tree/master/trax/models/reformer>`__.
-
-**Note**:
-
-- Reformer does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see `issue #36035
-  <https://github.com/pytorch/pytorch/issues/36035>`__
-
-Axial Positional Encodings
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Axial Positional Encodings were first implemented in Google's `trax library
-<https://github.com/google/trax/blob/4d99ad4965bab1deba227539758d59f0df0fef48/trax/layers/research/position_encodings.py#L29>`__
-and developed by the authors of this model's paper. In models that are treating very long input sequences, the
-conventional position id encodings store an embedings vector of size :math:`d` being the :obj:`config.hidden_size` for
-every position :math:`i, \ldots, n_s`, with :math:`n_s` being :obj:`config.max_embedding_size`. This means that having
-a sequence length of :math:`n_s = 2^{19} \approx 0.5M` and a ``config.hidden_size`` of :math:`d = 2^{10} \approx 1000`
-would result in a position encoding matrix:
-
-.. math::
-    X_{i,j}, \text{ with } i \in \left[1,\ldots, d\right] \text{ and } j \in \left[1,\ldots, n_s\right] 
-
-which alone has over 500M parameters to store. Axial positional encodings factorize :math:`X_{i,j}` into two matrices:
-
-.. math::
-    X^{1}_{i,j}, \text{ with } i \in \left[1,\ldots, d^1\right] \text{ and } j \in \left[1,\ldots, n_s^1\right] 
-
-and
-
-.. math::
-    X^{2}_{i,j}, \text{ with } i \in \left[1,\ldots, d^2\right] \text{ and } j \in \left[1,\ldots, n_s^2\right] 
-
-with:
-
-.. math::
-    d = d^1 + d^2 \text{ and } n_s = n_s^1 \times n_s^2 .
-
-Therefore the following holds:
-
-.. math::
-    X_{i,j} = \begin{cases}
-                X^{1}_{i, k}, & \text{if }\ i < d^1 \text{ with } k = j \mod n_s^1 \\
-                X^{2}_{i - d^1, l}, & \text{if } i \ge d^1 \text{ with } l = \lfloor\frac{j}{n_s^1}\rfloor
-              \end{cases}
-
-Intuitively, this means that a position embedding vector :math:`x_j \in \mathbb{R}^{d}` is now the composition of two
-factorized embedding vectors: :math:`x^1_{k, l} + x^2_{l, k}`, where as the :obj:`config.max_embedding_size` dimension
-:math:`j` is factorized into :math:`k \text{ and } l`. This design ensures that each position embedding vector
-:math:`x_j` is unique.
-
-Using the above example again, axial position encoding with :math:`d^1 = 2^5, d^2 = 2^5, n_s^1 = 2^9, n_s^2 = 2^{10}`
-can drastically reduced the number of parameters to :math:`2^{14} + 2^{15} \approx 49000` parameters.
-
-In practice, the parameter :obj:`config.axial_pos_embds_dim` is set to a tuple :math:`(d^1, d^2)` which sum has to be
-equal to :obj:`config.hidden_size` and :obj:`config.axial_pos_shape` is set to a tuple :math:`(n_s^1, n_s^2)` which
-product has to be equal to :obj:`config.max_embedding_size`, which during training has to be equal to the `sequence
-length` of the :obj:`input_ids`.
-
-
-LSH Self Attention
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In Locality sensitive hashing (LSH) self attention the key and query projection weights are tied. Therefore, the key
-query embedding vectors are also tied. LSH self attention uses the locality sensitive hashing mechanism proposed in
-`Practical and Optimal LSH for Angular Distance <https://arxiv.org/abs/1509.02897>`__ to assign each of the tied key
-query embedding vectors to one of :obj:`config.num_buckets` possible buckets. The premise is that the more "similar"
-key query embedding vectors (in terms of *cosine similarity*) are to each other, the more likely they are assigned to
-the same bucket.
-
-The accuracy of the LSH mechanism can be improved by increasing :obj:`config.num_hashes` or directly the argument
-:obj:`num_hashes` of the forward function so that the output of the LSH self attention better approximates the output
-of the "normal" full self attention. The buckets are then sorted and chunked into query key embedding vector chunks
-each of length :obj:`config.lsh_chunk_length`. For each chunk, the query embedding vectors attend to its key vectors
-(which are tied to themselves) and to the key embedding vectors of :obj:`config.lsh_num_chunks_before` previous
-neighboring chunks and :obj:`config.lsh_num_chunks_after` following neighboring chunks.
-
-For more information, see the `original Paper <https://arxiv.org/abs/2001.04451>`__ or this great `blog post
-<https://www.pragmatic.ml/reformer-deep-dive/>`__.
-
-Note that :obj:`config.num_buckets` can also be factorized into a list :math:`(n_{\text{buckets}}^1,
-n_{\text{buckets}}^2)`. This way instead of assigning the query key embedding vectors to one of :math:`(1,\ldots,
-n_{\text{buckets}})` they are assigned to one of :math:`(1-1,\ldots, n_{\text{buckets}}^1-1, \ldots,
-1-n_{\text{buckets}}^2, \ldots, n_{\text{buckets}}^1-n_{\text{buckets}}^2)`. This is crucial for very long sequences to
-save memory.
-
-When training a model from scratch, it is recommended to leave :obj:`config.num_buckets=None`, so that depending on the
-sequence length a good value for :obj:`num_buckets` is calculated on the fly. This value will then automatically be
-saved in the config and should be reused for inference.
-
-Using LSH self attention, the memory and time complexity of the query-key matmul operation can be reduced from
-:math:`\mathcal{O}(n_s \times n_s)` to :math:`\mathcal{O}(n_s \times \log(n_s))`, which usually represents the memory
-and time bottleneck in a transformer model, with :math:`n_s` being the sequence length.
-
-
-Local Self Attention
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Local self attention is essentially a "normal" self attention layer with key, query and value projections, but is
-chunked so that in each chunk of length :obj:`config.local_chunk_length` the query embedding vectors only attends to
-the key embedding vectors in its chunk and to the key embedding vectors of :obj:`config.local_num_chunks_before`
-previous neighboring chunks and :obj:`config.local_num_chunks_after` following neighboring chunks.
-
-Using Local self attention, the memory and time complexity of the query-key matmul operation can be reduced from
-:math:`\mathcal{O}(n_s \times n_s)` to :math:`\mathcal{O}(n_s \times \log(n_s))`, which usually represents the memory
-and time bottleneck in a transformer model, with :math:`n_s` being the sequence length.
-
-
-Training
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-During training, we must ensure that the sequence length is set to a value that can be divided by the least common
-multiple of :obj:`config.lsh_chunk_length` and :obj:`config.local_chunk_length` and that the parameters of the Axial
-Positional Encodings are correctly set as described above. Reformer is very memory efficient so that the model can
-easily be trained on sequences as long as 64000 tokens.
-
-For training, the :class:`~transformers.ReformerModelWithLMHead` should be used as follows:
-
-.. code-block::
-
-    input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
-    loss = model(input_ids, labels=input_ids)[0]
-
-
-ReformerConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ReformerConfig
-    :members:
-
-
-ReformerTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ReformerTokenizer
-    :members: save_vocabulary
-
-
-ReformerTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ReformerTokenizerFast
-    :members:
-
-
-ReformerModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ReformerModel
-    :members: forward
-
-
-ReformerModelWithLMHead
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ReformerModelWithLMHead
-    :members: forward
-
-
-ReformerForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ReformerForMaskedLM
-    :members: forward
-
-
-ReformerForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ReformerForSequenceClassification
-    :members: forward
-
-
-ReformerForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ReformerForQuestionAnswering
-    :members: forward
diff --git a/docs/source/model_doc/rembert.mdx b/docs/source/model_doc/rembert.mdx
new file mode 100644
index 0000000000..0edb8e5202
--- /dev/null
+++ b/docs/source/model_doc/rembert.mdx
@@ -0,0 +1,128 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# RemBERT
+
+## Overview
+
+The RemBERT model was proposed in [Rethinking Embedding Coupling in Pre-trained Language Models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, Melvin Johnson, Sebastian Ruder.
+
+The abstract from the paper is the following:
+
+*We re-evaluate the standard practice of sharing weights between input and output embeddings in state-of-the-art
+pre-trained language models. We show that decoupled embeddings provide increased modeling flexibility, allowing us to
+significantly improve the efficiency of parameter allocation in the input embedding of multilingual models. By
+reallocating the input embedding parameters in the Transformer layers, we achieve dramatically better performance on
+standard natural language understanding tasks with the same number of parameters during fine-tuning. We also show that
+allocating additional capacity to the output embedding provides benefits to the model that persist through the
+fine-tuning stage even though the output embedding is discarded after pre-training. Our analysis shows that larger
+output embeddings prevent the model's last layers from overspecializing to the pre-training task and encourage
+Transformer representations to be more general and more transferable to other tasks and languages. Harnessing these
+findings, we are able to train models that achieve strong performance on the XTREME benchmark without increasing the
+number of parameters at the fine-tuning stage.*
+
+Tips:
+
+For fine-tuning, RemBERT can be thought of as a bigger version of mBERT with an ALBERT-like factorization of the
+embedding layer. The embeddings are not tied in pre-training, in contrast with BERT, which enables smaller input
+embeddings (preserved during fine-tuning) and bigger output embeddings (discarded at fine-tuning). The tokenizer is
+also similar to the Albert one rather than the BERT one.
+
+## RemBertConfig
+
+[[autodoc]] RemBertConfig
+
+## RemBertTokenizer
+
+[[autodoc]] RemBertTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## RemBertTokenizerFast
+
+[[autodoc]] RemBertTokenizerFast
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## RemBertModel
+
+[[autodoc]] RemBertModel
+    - forward
+
+## RemBertForCausalLM
+
+[[autodoc]] RemBertForCausalLM
+    - forward
+
+## RemBertForMaskedLM
+
+[[autodoc]] RemBertForMaskedLM
+    - forward
+
+## RemBertForSequenceClassification
+
+[[autodoc]] RemBertForSequenceClassification
+    - forward
+
+## RemBertForMultipleChoice
+
+[[autodoc]] RemBertForMultipleChoice
+    - forward
+
+## RemBertForTokenClassification
+
+[[autodoc]] RemBertForTokenClassification
+    - forward
+
+## RemBertForQuestionAnswering
+
+[[autodoc]] RemBertForQuestionAnswering
+    - forward
+
+## TFRemBertModel
+
+[[autodoc]] TFRemBertModel
+    - call
+
+## TFRemBertForMaskedLM
+
+[[autodoc]] TFRemBertForMaskedLM
+    - call
+
+## TFRemBertForCausalLM
+
+[[autodoc]] TFRemBertForCausalLM
+    - call
+
+## TFRemBertForSequenceClassification
+
+[[autodoc]] TFRemBertForSequenceClassification
+    - call
+
+## TFRemBertForMultipleChoice
+
+[[autodoc]] TFRemBertForMultipleChoice
+    - call
+
+## TFRemBertForTokenClassification
+
+[[autodoc]] TFRemBertForTokenClassification
+    - call
+
+## TFRemBertForQuestionAnswering
+
+[[autodoc]] TFRemBertForQuestionAnswering
+    - call
diff --git a/docs/source/model_doc/rembert.rst b/docs/source/model_doc/rembert.rst
deleted file mode 100644
index 1c1678e3b8..0000000000
--- a/docs/source/model_doc/rembert.rst
+++ /dev/null
@@ -1,161 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-RemBERT
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The RemBERT model was proposed in `Rethinking Embedding Coupling in Pre-trained Language Models
-<https://arxiv.org/abs/2010.12821>`__ by Hyung Won Chung, Thibault Févry, Henry Tsai, Melvin Johnson, Sebastian Ruder.
-
-The abstract from the paper is the following:
-
-*We re-evaluate the standard practice of sharing weights between input and output embeddings in state-of-the-art
-pre-trained language models. We show that decoupled embeddings provide increased modeling flexibility, allowing us to
-significantly improve the efficiency of parameter allocation in the input embedding of multilingual models. By
-reallocating the input embedding parameters in the Transformer layers, we achieve dramatically better performance on
-standard natural language understanding tasks with the same number of parameters during fine-tuning. We also show that
-allocating additional capacity to the output embedding provides benefits to the model that persist through the
-fine-tuning stage even though the output embedding is discarded after pre-training. Our analysis shows that larger
-output embeddings prevent the model's last layers from overspecializing to the pre-training task and encourage
-Transformer representations to be more general and more transferable to other tasks and languages. Harnessing these
-findings, we are able to train models that achieve strong performance on the XTREME benchmark without increasing the
-number of parameters at the fine-tuning stage.*
-
-Tips:
-
-For fine-tuning, RemBERT can be thought of as a bigger version of mBERT with an ALBERT-like factorization of the
-embedding layer. The embeddings are not tied in pre-training, in contrast with BERT, which enables smaller input
-embeddings (preserved during fine-tuning) and bigger output embeddings (discarded at fine-tuning). The tokenizer is
-also similar to the Albert one rather than the BERT one.
-
-RemBertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RemBertConfig
-    :members:
-
-
-RemBertTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RemBertTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-RemBertTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RemBertTokenizerFast
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-RemBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RemBertModel
-    :members: forward
-
-
-RemBertForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RemBertForCausalLM
-    :members: forward
-
-
-RemBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RemBertForMaskedLM
-    :members: forward
-
-
-RemBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RemBertForSequenceClassification
-    :members: forward
-
-
-RemBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RemBertForMultipleChoice
-    :members: forward
-
-
-RemBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RemBertForTokenClassification
-    :members: forward
-
-
-RemBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RemBertForQuestionAnswering
-    :members: forward
-
-
-TFRemBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRemBertModel
-    :members: call
-
-
-TFRemBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRemBertForMaskedLM
-    :members: call
-
-
-TFRemBertForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRemBertForCausalLM
-    :members: call
-
-
-TFRemBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRemBertForSequenceClassification
-    :members: call
-
-
-TFRemBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRemBertForMultipleChoice
-    :members: call
-
-
-TFRemBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRemBertForTokenClassification
-    :members: call
-
-
-TFRemBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRemBertForQuestionAnswering
-    :members: call
diff --git a/docs/source/model_doc/retribert.mdx b/docs/source/model_doc/retribert.mdx
new file mode 100644
index 0000000000..cb717767bf
--- /dev/null
+++ b/docs/source/model_doc/retribert.mdx
@@ -0,0 +1,40 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# RetriBERT
+
+## Overview
+
+The RetriBERT model was proposed in the blog post [Explain Anything Like I'm Five: A Model for Open Domain Long Form
+Question Answering](https://yjernite.github.io/lfqa.html). RetriBERT is a small model that uses either a single or
+pair of BERT encoders with lower-dimension projection for dense semantic indexing of text.
+
+This model was contributed by [yjernite](https://huggingface.co/yjernite). Code to train and use the model can be
+found [here](https://github.com/huggingface/transformers/tree/master/examples/research-projects/distillation).
+
+
+## RetriBertConfig
+
+[[autodoc]] RetriBertConfig
+
+## RetriBertTokenizer
+
+[[autodoc]] RetriBertTokenizer
+
+## RetriBertTokenizerFast
+
+[[autodoc]] RetriBertTokenizerFast
+
+## RetriBertModel
+
+[[autodoc]] RetriBertModel
+    - forward
diff --git a/docs/source/model_doc/retribert.rst b/docs/source/model_doc/retribert.rst
deleted file mode 100644
index 568f7f2a34..0000000000
--- a/docs/source/model_doc/retribert.rst
+++ /dev/null
@@ -1,52 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-RetriBERT
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The RetriBERT model was proposed in the blog post `Explain Anything Like I'm Five: A Model for Open Domain Long Form
-Question Answering <https://yjernite.github.io/lfqa.html>`__. RetriBERT is a small model that uses either a single or
-pair of BERT encoders with lower-dimension projection for dense semantic indexing of text.
-
-This model was contributed by `yjernite <https://huggingface.co/yjernite>`__. Code to train and use the model can be
-found :prefix_link:`here <examples/research-projects/distillation>`.
-
-
-RetriBertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RetriBertConfig
-    :members:
-
-
-RetriBertTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RetriBertTokenizer
-    :members:
-
-
-RetriBertTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RetriBertTokenizerFast
-    :members:
-
-
-RetriBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RetriBertModel
-    :members: forward
diff --git a/docs/source/model_doc/roberta.mdx b/docs/source/model_doc/roberta.mdx
new file mode 100644
index 0000000000..9db63e93a4
--- /dev/null
+++ b/docs/source/model_doc/roberta.mdx
@@ -0,0 +1,162 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# RoBERTa
+
+## Overview
+
+The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
+Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018.
+
+It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with
+much larger mini-batches and learning rates.
+
+The abstract from the paper is the following:
+
+*Language model pretraining has led to significant performance gains but careful comparison between different
+approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes,
+and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication
+study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and
+training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every
+model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results
+highlight the importance of previously overlooked design choices, and raise questions about the source of recently
+reported improvements. We release our models and code.*
+
+Tips:
+
+- This implementation is the same as [`BertModel`] with a tiny embeddings tweak as well as a setup
+  for Roberta pretrained models.
+- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a
+  different pretraining scheme.
+- RoBERTa doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just
+  separate your segments with the separation token `tokenizer.sep_token` (or `</s>`)
+- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to this page for usage examples.
+
+This model was contributed by [julien-c](https://huggingface.co/julien-c). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta).
+
+
+## RobertaConfig
+
+[[autodoc]] RobertaConfig
+
+## RobertaTokenizer
+
+[[autodoc]] RobertaTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## RobertaTokenizerFast
+
+[[autodoc]] RobertaTokenizerFast
+    - build_inputs_with_special_tokens
+
+## RobertaModel
+
+[[autodoc]] RobertaModel
+    - forward
+
+## RobertaForCausalLM
+
+[[autodoc]] RobertaForCausalLM
+    - forward
+
+## RobertaForMaskedLM
+
+[[autodoc]] RobertaForMaskedLM
+    - forward
+
+## RobertaForSequenceClassification
+
+[[autodoc]] RobertaForSequenceClassification
+    - forward
+
+## RobertaForMultipleChoice
+
+[[autodoc]] RobertaForMultipleChoice
+    - forward
+
+## RobertaForTokenClassification
+
+[[autodoc]] RobertaForTokenClassification
+    - forward
+
+## RobertaForQuestionAnswering
+
+[[autodoc]] RobertaForQuestionAnswering
+    - forward
+
+## TFRobertaModel
+
+[[autodoc]] TFRobertaModel
+    - call
+
+## TFRobertaForCausalLM
+
+[[autodoc]] TFRobertaForCausalLM
+    - call
+
+## TFRobertaForMaskedLM
+
+[[autodoc]] TFRobertaForMaskedLM
+    - call
+
+## TFRobertaForSequenceClassification
+
+[[autodoc]] TFRobertaForSequenceClassification
+    - call
+
+## TFRobertaForMultipleChoice
+
+[[autodoc]] TFRobertaForMultipleChoice
+    - call
+
+## TFRobertaForTokenClassification
+
+[[autodoc]] TFRobertaForTokenClassification
+    - call
+
+## TFRobertaForQuestionAnswering
+
+[[autodoc]] TFRobertaForQuestionAnswering
+    - call
+
+## FlaxRobertaModel
+
+[[autodoc]] FlaxRobertaModel
+    - __call__
+
+## FlaxRobertaForMaskedLM
+
+[[autodoc]] FlaxRobertaForMaskedLM
+    - __call__
+
+## FlaxRobertaForSequenceClassification
+
+[[autodoc]] FlaxRobertaForSequenceClassification
+    - __call__
+
+## FlaxRobertaForMultipleChoice
+
+[[autodoc]] FlaxRobertaForMultipleChoice
+    - __call__
+
+## FlaxRobertaForTokenClassification
+
+[[autodoc]] FlaxRobertaForTokenClassification
+    - __call__
+
+## FlaxRobertaForQuestionAnswering
+
+[[autodoc]] FlaxRobertaForQuestionAnswering
+    - __call__
diff --git a/docs/source/model_doc/roberta.rst b/docs/source/model_doc/roberta.rst
deleted file mode 100644
index b9d533e804..0000000000
--- a/docs/source/model_doc/roberta.rst
+++ /dev/null
@@ -1,210 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-RoBERTa
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach
-<https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
-Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018.
-
-It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with
-much larger mini-batches and learning rates.
-
-The abstract from the paper is the following:
-
-*Language model pretraining has led to significant performance gains but careful comparison between different
-approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes,
-and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication
-study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and
-training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every
-model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results
-highlight the importance of previously overlooked design choices, and raise questions about the source of recently
-reported improvements. We release our models and code.*
-
-Tips:
-
-- This implementation is the same as :class:`~transformers.BertModel` with a tiny embeddings tweak as well as a setup
-  for Roberta pretrained models.
-- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a
-  different pretraining scheme.
-- RoBERTa doesn't have :obj:`token_type_ids`, you don't need to indicate which token belongs to which segment. Just
-  separate your segments with the separation token :obj:`tokenizer.sep_token` (or :obj:`</s>`)
-- :doc:`CamemBERT <camembert>` is a wrapper around RoBERTa. Refer to this page for usage examples.
-
-This model was contributed by `julien-c <https://huggingface.co/julien-c>`__. The original code can be found `here
-<https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_.
-
-
-RobertaConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaConfig
-    :members:
-
-
-RobertaTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-RobertaTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaTokenizerFast
-    :members: build_inputs_with_special_tokens
-
-
-RobertaModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaModel
-    :members: forward
-
-
-RobertaForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaForCausalLM
-    :members: forward
-
-
-RobertaForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaForMaskedLM
-    :members: forward
-
-
-RobertaForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaForSequenceClassification
-    :members: forward
-
-
-RobertaForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaForMultipleChoice
-    :members: forward
-
-
-RobertaForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaForTokenClassification
-    :members: forward
-
-
-RobertaForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaForQuestionAnswering
-    :members: forward
-
-
-TFRobertaModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRobertaModel
-    :members: call
-
-
-TFRobertaForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRobertaForCausalLM
-    :members: call
-
-
-TFRobertaForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRobertaForMaskedLM
-    :members: call
-
-
-TFRobertaForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRobertaForSequenceClassification
-    :members: call
-
-
-TFRobertaForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRobertaForMultipleChoice
-    :members: call
-
-
-TFRobertaForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRobertaForTokenClassification
-    :members: call
-
-
-TFRobertaForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRobertaForQuestionAnswering
-    :members: call
-
-
-FlaxRobertaModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxRobertaModel
-    :members: __call__
-
-
-FlaxRobertaForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxRobertaForMaskedLM
-    :members: __call__
-
-
-FlaxRobertaForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxRobertaForSequenceClassification
-    :members: __call__
-
-
-FlaxRobertaForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxRobertaForMultipleChoice
-    :members: __call__
-
-
-FlaxRobertaForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxRobertaForTokenClassification
-    :members: __call__
-
-
-FlaxRobertaForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxRobertaForQuestionAnswering
-    :members: __call__
diff --git a/docs/source/model_doc/roformer.mdx b/docs/source/model_doc/roformer.mdx
new file mode 100644
index 0000000000..a00da9ef9b
--- /dev/null
+++ b/docs/source/model_doc/roformer.mdx
@@ -0,0 +1,125 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# RoFormer
+
+## Overview
+
+The RoFormer model was proposed in [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+
+The abstract from the paper is the following:
+
+*Position encoding in transformer architecture provides supervision for dependency modeling between elements at
+different positions in the sequence. We investigate various methods to encode positional information in
+transformer-based language models and propose a novel implementation named Rotary Position Embedding(RoPE). The
+proposed RoPE encodes absolute positional information with rotation matrix and naturally incorporates explicit relative
+position dependency in self-attention formulation. Notably, RoPE comes with valuable properties such as flexibility of
+being expand to any sequence lengths, decaying inter-token dependency with increasing relative distances, and
+capability of equipping the linear self-attention with relative position encoding. As a result, the enhanced
+transformer with rotary position embedding, or RoFormer, achieves superior performance in tasks with long texts. We
+release the theoretical analysis along with some preliminary experiment results on Chinese data. The undergoing
+experiment for English benchmark will soon be updated.*
+
+Tips:
+
+- RoFormer is a BERT-like autoencoding model with rotary position embeddings. Rotary position embeddings have shown
+  improved performance on classification tasks with long texts.
+
+
+This model was contributed by [junnyu](https://huggingface.co/junnyu). The original code can be found [here](https://github.com/ZhuiyiTechnology/roformer).
+
+## RoFormerConfig
+
+[[autodoc]] RoFormerConfig
+
+## RoFormerTokenizer
+
+[[autodoc]] RoFormerTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## RoFormerTokenizerFast
+
+[[autodoc]] RoFormerTokenizerFast
+    - build_inputs_with_special_tokens
+
+## RoFormerModel
+
+[[autodoc]] RoFormerModel
+    - forward
+
+## RoFormerForCausalLM
+
+[[autodoc]] RoFormerForCausalLM
+    - forward
+
+## RoFormerForMaskedLM
+
+[[autodoc]] RoFormerForMaskedLM
+    - forward
+
+## RoFormerForSequenceClassification
+
+[[autodoc]] RoFormerForSequenceClassification
+    - forward
+
+## RoFormerForMultipleChoice
+
+[[autodoc]] RoFormerForMultipleChoice
+    - forward
+
+## RoFormerForTokenClassification
+
+[[autodoc]] RoFormerForTokenClassification
+    - forward
+
+## RoFormerForQuestionAnswering
+
+[[autodoc]] RoFormerForQuestionAnswering
+    - forward
+
+## TFRoFormerModel
+
+[[autodoc]] TFRoFormerModel
+    - call
+
+## TFRoFormerForMaskedLM
+
+[[autodoc]] TFRoFormerForMaskedLM
+    - call
+
+## TFRoFormerForCausalLM
+
+[[autodoc]] TFRoFormerForCausalLM
+    - call
+
+## TFRoFormerForSequenceClassification
+
+[[autodoc]] TFRoFormerForSequenceClassification
+    - call
+
+## TFRoFormerForMultipleChoice
+
+[[autodoc]] TFRoFormerForMultipleChoice
+    - call
+
+## TFRoFormerForTokenClassification
+
+[[autodoc]] TFRoFormerForTokenClassification
+    - call
+
+## TFRoFormerForQuestionAnswering
+
+[[autodoc]] TFRoFormerForQuestionAnswering
+    - call
diff --git a/docs/source/model_doc/roformer.rst b/docs/source/model_doc/roformer.rst
deleted file mode 100644
index 21f1fe6bbe..0000000000
--- a/docs/source/model_doc/roformer.rst
+++ /dev/null
@@ -1,161 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-RoFormer
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The RoFormer model was proposed in `RoFormer: Enhanced Transformer with Rotary Position Embedding
-<https://arxiv.org/pdf/2104.09864v1.pdf>`__ by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-
-The abstract from the paper is the following:
-
-*Position encoding in transformer architecture provides supervision for dependency modeling between elements at
-different positions in the sequence. We investigate various methods to encode positional information in
-transformer-based language models and propose a novel implementation named Rotary Position Embedding(RoPE). The
-proposed RoPE encodes absolute positional information with rotation matrix and naturally incorporates explicit relative
-position dependency in self-attention formulation. Notably, RoPE comes with valuable properties such as flexibility of
-being expand to any sequence lengths, decaying inter-token dependency with increasing relative distances, and
-capability of equipping the linear self-attention with relative position encoding. As a result, the enhanced
-transformer with rotary position embedding, or RoFormer, achieves superior performance in tasks with long texts. We
-release the theoretical analysis along with some preliminary experiment results on Chinese data. The undergoing
-experiment for English benchmark will soon be updated.*
-
-Tips:
-
-- RoFormer is a BERT-like autoencoding model with rotary position embeddings. Rotary position embeddings have shown
-  improved performance on classification tasks with long texts.
-
-
-This model was contributed by `junnyu <https://huggingface.co/junnyu>`__. The original code can be found `here
-<https://github.com/ZhuiyiTechnology/roformer>`__.
-
-RoFormerConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RoFormerConfig
-    :members:
-
-
-RoFormerTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RoFormerTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-RoFormerTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RoFormerTokenizerFast
-    :members: build_inputs_with_special_tokens
-
-
-RoFormerModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RoFormerModel
-    :members: forward
-
-
-RoFormerForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RoFormerForCausalLM
-    :members: forward
-
-
-RoFormerForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RoFormerForMaskedLM
-    :members: forward
-
-
-RoFormerForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RoFormerForSequenceClassification
-    :members: forward
-
-
-RoFormerForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RoFormerForMultipleChoice
-    :members: forward
-
-
-RoFormerForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RoFormerForTokenClassification
-    :members: forward
-
-
-RoFormerForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RoFormerForQuestionAnswering
-    :members: forward
-
-
-TFRoFormerModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRoFormerModel
-    :members: call
-
-
-TFRoFormerForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRoFormerForMaskedLM
-    :members: call
-
-
-TFRoFormerForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRoFormerForCausalLM
-    :members: call
-
-
-TFRoFormerForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRoFormerForSequenceClassification
-    :members: call
-
-
-TFRoFormerForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRoFormerForMultipleChoice
-    :members: call
-
-
-TFRoFormerForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRoFormerForTokenClassification
-    :members: call
-
-
-TFRoFormerForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRoFormerForQuestionAnswering
-    :members: call
diff --git a/docs/source/model_doc/segformer.mdx b/docs/source/model_doc/segformer.mdx
new file mode 100644
index 0000000000..bc053ee52f
--- /dev/null
+++ b/docs/source/model_doc/segformer.mdx
@@ -0,0 +1,104 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# SegFormer
+
+## Overview
+
+The SegFormer model was proposed in [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping
+Luo. The model consists of a hierarchical Transformer encoder and a lightweight all-MLP decode head to achieve great
+results on image segmentation benchmarks such as ADE20K and Cityscapes.
+
+The abstract from the paper is the following:
+
+*We present SegFormer, a simple, efficient yet powerful semantic segmentation framework which unifies Transformers with
+lightweight multilayer perception (MLP) decoders. SegFormer has two appealing features: 1) SegFormer comprises a novel
+hierarchically structured Transformer encoder which outputs multiscale features. It does not need positional encoding,
+thereby avoiding the interpolation of positional codes which leads to decreased performance when the testing resolution
+differs from training. 2) SegFormer avoids complex decoders. The proposed MLP decoder aggregates information from
+different layers, and thus combining both local attention and global attention to render powerful representations. We
+show that this simple and lightweight design is the key to efficient segmentation on Transformers. We scale our
+approach up to obtain a series of models from SegFormer-B0 to SegFormer-B5, reaching significantly better performance
+and efficiency than previous counterparts. For example, SegFormer-B4 achieves 50.3% mIoU on ADE20K with 64M parameters,
+being 5x smaller and 2.2% better than the previous best method. Our best model, SegFormer-B5, achieves 84.0% mIoU on
+Cityscapes validation set and shows excellent zero-shot robustness on Cityscapes-C.*
+
+The figure below illustrates the architecture of SegFormer. Taken from the [original paper](https://arxiv.org/abs/2105.15203).
+
+<img width="600" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/segformer_architecture.png"/>
+
+This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/NVlabs/SegFormer).
+
+Tips:
+
+- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decode head.
+  [`SegformerModel`] is the hierarchical Transformer encoder (which in the paper is also referred to
+  as Mix Transformer or MiT). [`SegformerForSemanticSegmentation`] adds the all-MLP decode head on
+  top to perform semantic segmentation of images. In addition, there's
+  [`SegformerForImageClassification`] which can be used to - you guessed it - classify images. The
+  authors of SegFormer first pre-trained the Transformer encoder on ImageNet-1k to classify images. Next, they throw
+  away the classification head, and replace it by the all-MLP decode head. Next, they fine-tune the model altogether on
+  ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be
+  found on the [hub](https://huggingface.co/models?other=segformer).
+- The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and
+  fine-tuning on custom data).
+- One can use [`SegformerFeatureExtractor`] to prepare images and corresponding segmentation maps
+  for the model. Note that this feature extractor is fairly basic and does not include all data augmentations used in
+  the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found [here](https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py). The most
+  important preprocessing step is that images and segmentation maps are randomly cropped and padded to the same size,
+  such as 512x512 or 640x640, after which they are normalized.
+- One additional thing to keep in mind is that one can initialize [`SegformerFeatureExtractor`] with
+  `reduce_labels` set to *True* or *False*. In some datasets (like ADE20k), the 0 index is used in the annotated
+  segmentation maps for background. However, ADE20k doesn't include the "background" class in its 150 labels.
+  Therefore, `reduce_labels` is used to reduce all labels by 1, and to make sure no loss is computed for the
+  background class (i.e. it replaces 0 in the annotated maps by 255, which is the *ignore_index* of the loss function
+  used by [`SegformerForSemanticSegmentation`]). However, other datasets use the 0 index as
+  background class and include this class as part of all labels. In that case, `reduce_labels` should be set to
+  *False*, as loss should also be computed for the background class.
+- As most models, SegFormer comes in different sizes, the details of which can be found in the table below.
+
+| **Model variant** | **Depths**    | **Hidden sizes**    | **Decoder hidden size** | **Params (M)** | **ImageNet-1k Top 1** |
+| MiT-b0            | [2, 2, 2, 2]  | [32, 64, 160, 256]  | 256                     | 3.7            | 70.5                  |
+| MiT-b1            | [2, 2, 2, 2]  | [64, 128, 320, 512] | 256                     | 14.0           | 78.7                  |
+| MiT-b2            | [3, 4, 6, 3]  | [64, 128, 320, 512] | 768                     | 25.4           | 81.6                  |
+| MiT-b3            | [3, 4, 18, 3] | [64, 128, 320, 512] | 768                     | 45.2           | 83.1                  |
+| MiT-b4            | [3, 8, 27, 3] | [64, 128, 320, 512] | 768                     | 62.6           | 83.6                  |
+| MiT-b5            | [3, 6, 40, 3] | [64, 128, 320, 512] | 768                     | 82.0           | 83.8                  |
+
+## SegformerConfig
+
+[[autodoc]] SegformerConfig
+
+## SegformerFeatureExtractor
+
+[[autodoc]] SegformerFeatureExtractor
+    - __call__
+
+## SegformerModel
+
+[[autodoc]] SegformerModel
+    - forward
+
+## SegformerDecodeHead
+
+[[autodoc]] SegformerDecodeHead
+    - forward
+
+## SegformerForImageClassification
+
+[[autodoc]] SegformerForImageClassification
+    - forward
+
+## SegformerForSemanticSegmentation
+
+[[autodoc]] SegformerForSemanticSegmentation
+    - forward
diff --git a/docs/source/model_doc/segformer.rst b/docs/source/model_doc/segformer.rst
deleted file mode 100644
index 342b393576..0000000000
--- a/docs/source/model_doc/segformer.rst
+++ /dev/null
@@ -1,132 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-SegFormer
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The SegFormer model was proposed in `SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers
-<https://arxiv.org/abs/2105.15203>`__ by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping
-Luo. The model consists of a hierarchical Transformer encoder and a lightweight all-MLP decode head to achieve great
-results on image segmentation benchmarks such as ADE20K and Cityscapes.
-
-The abstract from the paper is the following:
-
-*We present SegFormer, a simple, efficient yet powerful semantic segmentation framework which unifies Transformers with
-lightweight multilayer perception (MLP) decoders. SegFormer has two appealing features: 1) SegFormer comprises a novel
-hierarchically structured Transformer encoder which outputs multiscale features. It does not need positional encoding,
-thereby avoiding the interpolation of positional codes which leads to decreased performance when the testing resolution
-differs from training. 2) SegFormer avoids complex decoders. The proposed MLP decoder aggregates information from
-different layers, and thus combining both local attention and global attention to render powerful representations. We
-show that this simple and lightweight design is the key to efficient segmentation on Transformers. We scale our
-approach up to obtain a series of models from SegFormer-B0 to SegFormer-B5, reaching significantly better performance
-and efficiency than previous counterparts. For example, SegFormer-B4 achieves 50.3% mIoU on ADE20K with 64M parameters,
-being 5x smaller and 2.2% better than the previous best method. Our best model, SegFormer-B5, achieves 84.0% mIoU on
-Cityscapes validation set and shows excellent zero-shot robustness on Cityscapes-C.*
-
-The figure below illustrates the architecture of SegFormer. Taken from the `original paper
-<https://arxiv.org/abs/2105.15203>`__.
-
-.. image:: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/segformer_architecture.png
-  :width: 600
-
-This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
-<https://github.com/NVlabs/SegFormer>`__.
-
-Tips:
-
-- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decode head.
-  :class:`~transformers.SegformerModel` is the hierarchical Transformer encoder (which in the paper is also referred to
-  as Mix Transformer or MiT). :class:`~transformers.SegformerForSemanticSegmentation` adds the all-MLP decode head on
-  top to perform semantic segmentation of images. In addition, there's
-  :class:`~transformers.SegformerForImageClassification` which can be used to - you guessed it - classify images. The
-  authors of SegFormer first pre-trained the Transformer encoder on ImageNet-1k to classify images. Next, they throw
-  away the classification head, and replace it by the all-MLP decode head. Next, they fine-tune the model altogether on
-  ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be
-  found on the `hub <https://huggingface.co/models?other=segformer>`__.
-- The quickest way to get started with SegFormer is by checking the `example notebooks
-  <https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer>`__ (which showcase both inference and
-  fine-tuning on custom data).
-- One can use :class:`~transformers.SegformerFeatureExtractor` to prepare images and corresponding segmentation maps
-  for the model. Note that this feature extractor is fairly basic and does not include all data augmentations used in
-  the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found `here
-  <https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py>`__. The most
-  important preprocessing step is that images and segmentation maps are randomly cropped and padded to the same size,
-  such as 512x512 or 640x640, after which they are normalized.
-- One additional thing to keep in mind is that one can initialize :class:`~transformers.SegformerFeatureExtractor` with
-  :obj:`reduce_labels` set to `True` or `False`. In some datasets (like ADE20k), the 0 index is used in the annotated
-  segmentation maps for background. However, ADE20k doesn't include the "background" class in its 150 labels.
-  Therefore, :obj:`reduce_labels` is used to reduce all labels by 1, and to make sure no loss is computed for the
-  background class (i.e. it replaces 0 in the annotated maps by 255, which is the `ignore_index` of the loss function
-  used by :class:`~transformers.SegformerForSemanticSegmentation`). However, other datasets use the 0 index as
-  background class and include this class as part of all labels. In that case, :obj:`reduce_labels` should be set to
-  `False`, as loss should also be computed for the background class.
-- As most models, SegFormer comes in different sizes, the details of which can be found in the table below.
-
-+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
-| **Model variant** | **Depths**    | **Hidden sizes**    | **Decoder hidden size** | **Params (M)** | **ImageNet-1k Top 1** |
-+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
-| MiT-b0            | [2, 2, 2, 2]  | [32, 64, 160, 256]  | 256                     | 3.7            | 70.5                  |
-+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
-| MiT-b1            | [2, 2, 2, 2]  | [64, 128, 320, 512] | 256                     | 14.0           | 78.7                  |
-+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
-| MiT-b2            | [3, 4, 6, 3]  | [64, 128, 320, 512] | 768                     | 25.4           | 81.6                  |
-+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
-| MiT-b3            | [3, 4, 18, 3] | [64, 128, 320, 512] | 768                     | 45.2           | 83.1                  |
-+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
-| MiT-b4            | [3, 8, 27, 3] | [64, 128, 320, 512] | 768                     | 62.6           | 83.6                  |
-+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
-| MiT-b5            | [3, 6, 40, 3] | [64, 128, 320, 512] | 768                     | 82.0           | 83.8                  |
-+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
-
-SegformerConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SegformerConfig
-    :members:
-
-
-SegformerFeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SegformerFeatureExtractor
-    :members: __call__
-
-
-SegformerModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SegformerModel
-    :members: forward
-
-
-SegformerDecodeHead
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SegformerDecodeHead
-    :members: forward
-
-
-SegformerForImageClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SegformerForImageClassification
-    :members: forward
-
-
-SegformerForSemanticSegmentation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SegformerForSemanticSegmentation
-    :members: forward
diff --git a/docs/source/model_doc/sew.mdx b/docs/source/model_doc/sew.mdx
new file mode 100644
index 0000000000..dce949a856
--- /dev/null
+++ b/docs/source/model_doc/sew.mdx
@@ -0,0 +1,57 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# SEW
+
+## Overview
+
+SEW (Squeezed and Efficient Wav2Vec) was proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training
+for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q.
+Weinberger, Yoav Artzi.
+
+The abstract from the paper is the following:
+
+*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition
+(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance
+and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a
+pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a
+variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x
+inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference
+time, SEW reduces word error rate by 25-50% across different model sizes.*
+
+Tips:
+
+- SEW is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
+- SEWForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using
+  [`Wav2Vec2CTCTokenizer`].
+
+This model was contributed by [anton-l](https://huggingface.co/anton-l).
+
+
+## SEWConfig
+
+[[autodoc]] SEWConfig
+
+## SEWModel
+
+[[autodoc]] SEWModel
+    - forward
+
+## SEWForCTC
+
+[[autodoc]] SEWForCTC
+    - forward
+
+## SEWForSequenceClassification
+
+[[autodoc]] SEWForSequenceClassification
+    - forward
diff --git a/docs/source/model_doc/sew.rst b/docs/source/model_doc/sew.rst
deleted file mode 100644
index f81bbc26dc..0000000000
--- a/docs/source/model_doc/sew.rst
+++ /dev/null
@@ -1,67 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-SEW
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-SEW (Squeezed and Efficient Wav2Vec) was proposed in `Performance-Efficiency Trade-offs in Unsupervised Pre-training
-for Speech Recognition <https://arxiv.org/abs/2109.06870>`__ by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q.
-Weinberger, Yoav Artzi.
-
-The abstract from the paper is the following:
-
-*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition
-(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance
-and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a
-pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a
-variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x
-inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference
-time, SEW reduces word error rate by 25-50% across different model sizes.*
-
-Tips:
-
-- SEW is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
-- SEWForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using
-  :class:`~transformers.Wav2Vec2CTCTokenizer`.
-
-This model was contributed by `anton-l <https://huggingface.co/anton-l>`__.
-
-
-SEWConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SEWConfig
-    :members:
-
-
-SEWModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SEWModel
-    :members: forward
-
-
-SEWForCTC
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SEWForCTC
-    :members: forward
-
-
-SEWForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SEWForSequenceClassification
-    :members: forward
diff --git a/docs/source/model_doc/sew_d.mdx b/docs/source/model_doc/sew_d.mdx
new file mode 100644
index 0000000000..ceeb4f1ec3
--- /dev/null
+++ b/docs/source/model_doc/sew_d.mdx
@@ -0,0 +1,57 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# SEW-D
+
+## Overview
+
+SEW-D (Squeezed and Efficient Wav2Vec with Disentangled attention) was proposed in [Performance-Efficiency Trade-offs
+in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim,
+Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
+
+The abstract from the paper is the following:
+
+*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition
+(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance
+and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a
+pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a
+variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x
+inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference
+time, SEW reduces word error rate by 25-50% across different model sizes.*
+
+Tips:
+
+- SEW-D is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
+- SEWDForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded
+  using [`Wav2Vec2CTCTokenizer`].
+
+This model was contributed by [anton-l](https://huggingface.co/anton-l).
+
+
+## SEWDConfig
+
+[[autodoc]] SEWDConfig
+
+## SEWDModel
+
+[[autodoc]] SEWDModel
+    - forward
+
+## SEWDForCTC
+
+[[autodoc]] SEWDForCTC
+    - forward
+
+## SEWDForSequenceClassification
+
+[[autodoc]] SEWDForSequenceClassification
+    - forward
diff --git a/docs/source/model_doc/sew_d.rst b/docs/source/model_doc/sew_d.rst
deleted file mode 100644
index 809e93bf45..0000000000
--- a/docs/source/model_doc/sew_d.rst
+++ /dev/null
@@ -1,66 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-SEW-D
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-SEW-D (Squeezed and Efficient Wav2Vec with Disentangled attention) was proposed in `Performance-Efficiency Trade-offs
-in Unsupervised Pre-training for Speech Recognition <https://arxiv.org/abs/2109.06870>`__ by Felix Wu, Kwangyoun Kim,
-Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
-
-The abstract from the paper is the following:
-
-*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition
-(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance
-and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a
-pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a
-variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x
-inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference
-time, SEW reduces word error rate by 25-50% across different model sizes.*
-
-Tips:
-
-- SEW-D is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
-- SEWDForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded
-  using :class:`~transformers.Wav2Vec2CTCTokenizer`.
-
-This model was contributed by `anton-l <https://huggingface.co/anton-l>`__.
-
-
-SEWDConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SEWDConfig
-    :members:
-
-
-SEWDModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SEWDModel
-    :members: forward
-
-
-SEWDForCTC
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SEWDForCTC
-    :members: forward
-
-SEWDForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SEWDForSequenceClassification
-    :members: forward
diff --git a/docs/source/model_doc/speech_to_text.mdx b/docs/source/model_doc/speech_to_text.mdx
new file mode 100644
index 0000000000..58945ac3a6
--- /dev/null
+++ b/docs/source/model_doc/speech_to_text.mdx
@@ -0,0 +1,138 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Speech2Text
+
+## Overview
+
+The Speech2Text model was proposed in [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. It's a
+transformer-based seq2seq (encoder-decoder) model designed for end-to-end Automatic Speech Recognition (ASR) and Speech
+Translation (ST). It uses a convolutional downsampler to reduce the length of speech inputs by 3/4th before they are
+fed into the encoder. The model is trained with standard autoregressive cross-entropy loss and generates the
+transcripts/translations autoregressively. Speech2Text has been fine-tuned on several datasets for ASR and ST:
+[LibriSpeech](http://www.openslr.org/12), [CoVoST 2](https://github.com/facebookresearch/covost), [MuST-C](https://ict.fbk.eu/must-c/).
+
+This model was contributed by [valhalla](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text).
+
+
+## Inference
+
+Speech2Text is a speech model that accepts a float tensor of log-mel filter-bank features extracted from the speech
+signal. It's a transformer-based seq2seq model, so the transcripts/translations are generated autoregressively. The
+`generate()` method can be used for inference.
+
+The [`Speech2TextFeatureExtractor`] class is responsible for extracting the log-mel filter-bank
+features. The [`Speech2TextProcessor`] wraps [`Speech2TextFeatureExtractor`] and
+[`Speech2TextTokenizer`] into a single instance to both extract the input features and decode the
+predicted token ids.
+
+The feature extractor depends on `torchaudio` and the tokenizer depends on `sentencepiece` so be sure to
+install those packages before running the examples. You could either install those as extra speech dependencies with
+`pip install transformers"[speech, sentencepiece]"` or install the packages seperately with `pip install torchaudio sentencepiece`. Also `torchaudio` requires the development version of the [libsndfile](http://www.mega-nerd.com/libsndfile/) package which can be installed via a system package manager. On Ubuntu it can
+be installed as follows: `apt install libsndfile1-dev`
+
+
+- ASR and Speech Translation
+
+```python
+>>> import torch
+>>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
+>>> from datasets import load_dataset
+>>> import soundfile as sf
+
+>>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
+>>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
+
+>>> def map_to_array(batch):
+...     speech, _ = sf.read(batch["file"])
+...     batch["speech"] = speech
+...     return batch
+
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> ds = ds.map(map_to_array)
+
+>>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
+>>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask"])
+
+>>> transcription = processor.batch_decode(generated_ids)
+```
+
+- Multilingual speech translation
+
+  For multilingual speech translation models, `eos_token_id` is used as the `decoder_start_token_id` and
+  the target language id is forced as the first generated token. To force the target language id as the first
+  generated token, pass the `forced_bos_token_id` parameter to the `generate()` method. The following
+  example shows how to transate English speech to French text using the *facebook/s2t-medium-mustc-multilingual-st*
+  checkpoint.
+
+```python
+>>> import torch
+>>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
+>>> from datasets import load_dataset
+>>> import soundfile as sf
+
+>>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
+>>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
+
+>>> def map_to_array(batch):
+...     speech, _ = sf.read(batch["file"])
+...     batch["speech"] = speech
+...     return batch
+
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> ds = ds.map(map_to_array)
+
+>>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
+>>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask], forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"])
+
+>>> translation = processor.batch_decode(generated_ids)
+```
+
+See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look for Speech2Text checkpoints.
+
+
+## Speech2TextConfig
+
+[[autodoc]] Speech2TextConfig
+
+## Speech2TextTokenizer
+
+[[autodoc]] Speech2TextTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## Speech2TextFeatureExtractor
+
+[[autodoc]] Speech2TextFeatureExtractor
+    - __call__
+
+## Speech2TextProcessor
+
+[[autodoc]] Speech2TextProcessor
+    - __call__
+    - from_pretrained
+    - save_pretrained
+    - batch_decode
+    - decode
+    - as_target_processor
+
+## Speech2TextModel
+
+[[autodoc]] Speech2TextModel
+    - forward
+
+## Speech2TextForConditionalGeneration
+
+[[autodoc]] Speech2TextForConditionalGeneration
+    - forward
diff --git a/docs/source/model_doc/speech_to_text.rst b/docs/source/model_doc/speech_to_text.rst
deleted file mode 100644
index aa67672947..0000000000
--- a/docs/source/model_doc/speech_to_text.rst
+++ /dev/null
@@ -1,153 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Speech2Text
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Speech2Text model was proposed in `fairseq S2T: Fast Speech-to-Text Modeling with fairseq
-<https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. It's a
-transformer-based seq2seq (encoder-decoder) model designed for end-to-end Automatic Speech Recognition (ASR) and Speech
-Translation (ST). It uses a convolutional downsampler to reduce the length of speech inputs by 3/4th before they are
-fed into the encoder. The model is trained with standard autoregressive cross-entropy loss and generates the
-transcripts/translations autoregressively. Speech2Text has been fine-tuned on several datasets for ASR and ST:
-`LibriSpeech <http://www.openslr.org/12>`__, `CoVoST 2 <https://github.com/facebookresearch/covost>`__, `MuST-C
-<https://ict.fbk.eu/must-c/>`__.
-
-This model was contributed by `valhalla <https://huggingface.co/valhalla>`__. The original code can be found `here
-<https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text>`__.
-
-
-Inference
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Speech2Text is a speech model that accepts a float tensor of log-mel filter-bank features extracted from the speech
-signal. It's a transformer-based seq2seq model, so the transcripts/translations are generated autoregressively. The
-:obj:`generate()` method can be used for inference.
-
-The :class:`~transformers.Speech2TextFeatureExtractor` class is responsible for extracting the log-mel filter-bank
-features. The :class:`~transformers.Speech2TextProcessor` wraps :class:`~transformers.Speech2TextFeatureExtractor` and
-:class:`~transformers.Speech2TextTokenizer` into a single instance to both extract the input features and decode the
-predicted token ids.
-
-The feature extractor depends on :obj:`torchaudio` and the tokenizer depends on :obj:`sentencepiece` so be sure to
-install those packages before running the examples. You could either install those as extra speech dependencies with
-``pip install transformers"[speech, sentencepiece]"`` or install the packages seperately with ``pip install torchaudio
-sentencepiece``. Also ``torchaudio`` requires the development version of the `libsndfile
-<http://www.mega-nerd.com/libsndfile/>`__ package which can be installed via a system package manager. On Ubuntu it can
-be installed as follows: ``apt install libsndfile1-dev``
-
-
-- ASR and Speech Translation
-
-.. code-block::
-
-        >>> import torch
-        >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
-        >>> from datasets import load_dataset
-        >>> import soundfile as sf
-
-        >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
-        >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
-
-        >>> def map_to_array(batch):
-        ...     speech, _ = sf.read(batch["file"])
-        ...     batch["speech"] = speech
-        ...     return batch
-
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        >>> ds = ds.map(map_to_array)
-
-        >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
-        >>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask"])
-
-        >>> transcription = processor.batch_decode(generated_ids)
-
-
-- Multilingual speech translation
-
-    For multilingual speech translation models, :obj:`eos_token_id` is used as the :obj:`decoder_start_token_id` and
-    the target language id is forced as the first generated token. To force the target language id as the first
-    generated token, pass the :obj:`forced_bos_token_id` parameter to the :obj:`generate()` method. The following
-    example shows how to transate English speech to French text using the `facebook/s2t-medium-mustc-multilingual-st`
-    checkpoint.
-
-.. code-block::
-
-        >>> import torch
-        >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
-        >>> from datasets import load_dataset
-        >>> import soundfile as sf
-
-        >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
-        >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
-
-        >>> def map_to_array(batch):
-        ...     speech, _ = sf.read(batch["file"])
-        ...     batch["speech"] = speech
-        ...     return batch
-
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        >>> ds = ds.map(map_to_array)
-
-        >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
-        >>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask], forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"])
-
-        >>> translation = processor.batch_decode(generated_ids)
-
-
-See the `model hub <https://huggingface.co/models?filter=speech_to_text>`__ to look for Speech2Text checkpoints.
-
-
-Speech2TextConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextConfig
-    :members:
-
-
-Speech2TextTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-Speech2TextFeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextFeatureExtractor
-    :members: __call__
-
-
-Speech2TextProcessor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextProcessor
-    :members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
-
-
-Speech2TextModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextModel
-    :members: forward
-
-
-Speech2TextForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextForConditionalGeneration
-    :members: forward
diff --git a/docs/source/model_doc/speech_to_text_2.mdx b/docs/source/model_doc/speech_to_text_2.mdx
new file mode 100644
index 0000000000..a95f1d0af9
--- /dev/null
+++ b/docs/source/model_doc/speech_to_text_2.mdx
@@ -0,0 +1,116 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Speech2Text2
+
+## Overview
+
+The Speech2Text2 model is used together with [Wav2Vec2](wav2vec2) for Speech Translation models proposed in
+[Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by
+Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+
+Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only*, such as
+[Wav2Vec2](wav2vec2) or [HuBERT](hubert) for Speech-to-Text tasks. Please refer to the
+[SpeechEncoderDecoder](speechencoderdecoder) class on how to combine Speech2Text2 with any speech *encoder-only*
+model.
+
+This model was contributed by [Patrick von Platen](https://huggingface.co/patrickvonplaten).
+
+The original code can be found [here](https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266).
+
+
+Tips:
+
+- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see
+  the [official models](https://huggingface.co/models?other=speech2text2) .
+- Speech2Text2 is always used within the [SpeechEncoderDecoder](speechencoderdecoder) framework.
+- Speech2Text2's tokenizer is based on [fastBPE](https://github.com/glample/fastBPE).
+
+## Inference
+
+Speech2Text2's [`SpeechEncoderDecoderModel`] model accepts raw waveform input values from speech and
+makes use of [`~generation_utils.GenerationMixin.generate`] to translate the input speech
+autoregressively to the target language.
+
+The [`Wav2Vec2FeatureExtractor`] class is responsible for preprocessing the input speech and
+[`Speech2Text2Tokenizer`] decodes the generated target tokens to the target string. The
+[`Speech2Text2Processor`] wraps [`Wav2Vec2FeatureExtractor`] and
+[`Speech2Text2Tokenizer`] into a single instance to both extract the input features and decode the
+predicted token ids.
+
+- Step-by-step Speech Translation
+
+```python
+>>> import torch
+>>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
+>>> from datasets import load_dataset
+>>> import soundfile as sf
+
+>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+>>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+
+>>> def map_to_array(batch):
+...     speech, _ = sf.read(batch["file"])
+...     batch["speech"] = speech
+...     return batch
+
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> ds = ds.map(map_to_array)
+
+>>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
+>>> generated_ids = model.generate(input_ids=inputs["input_values"], attention_mask=inputs["attention_mask"])
+
+>>> transcription = processor.batch_decode(generated_ids)
+```
+
+- Speech Translation via Pipelines
+
+  The automatic speech recognition pipeline can also be used to translate speech in just a couple lines of code
+
+```python
+>>> from datasets import load_dataset
+>>> from transformers import pipeline
+
+>>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> asr = pipeline("automatic-speech-recognition", model="facebook/s2t-wav2vec2-large-en-de", feature_extractor="facebook/s2t-wav2vec2-large-en-de")
+
+>>> translation_de = asr(librispeech_en[0]["file"])
+```
+
+See [model hub](https://huggingface.co/models?filter=speech2text2) to look for Speech2Text2 checkpoints.
+
+
+## Speech2Text2Config
+
+[[autodoc]] Speech2Text2Config
+
+## Speech2TextTokenizer
+
+[[autodoc]] Speech2Text2Tokenizer
+    - batch_decode
+    - decode
+    - save_vocabulary
+
+## Speech2Text2Processor
+
+[[autodoc]] Speech2Text2Processor
+    - __call__
+    - from_pretrained
+    - save_pretrained
+    - batch_decode
+    - decode
+    - as_target_processor
+
+## Speech2Text2ForCausalLM
+
+[[autodoc]] Speech2Text2ForCausalLM
+    - forward
diff --git a/docs/source/model_doc/speech_to_text_2.rst b/docs/source/model_doc/speech_to_text_2.rst
deleted file mode 100644
index 2ab83c86ad..0000000000
--- a/docs/source/model_doc/speech_to_text_2.rst
+++ /dev/null
@@ -1,123 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Speech2Text2
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Speech2Text2 model is used together with :doc:`Wav2Vec2 <wav2vec2>` for Speech Translation models proposed in
-`Large-Scale Self- and Semi-Supervised Learning for Speech Translation <https://arxiv.org/abs/2104.06678>`__ by
-Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-
-Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only*, such as
-:doc:`Wav2Vec2 <wav2vec2>` or :doc:`HuBERT <hubert>` for Speech-to-Text tasks. Please refer to the
-:doc:`SpeechEncoderDecoder <speechencoderdecoder>` class on how to combine Speech2Text2 with any speech *encoder-only*
-model.
-
-This model was contributed by `Patrick von Platen <https://huggingface.co/patrickvonplaten>`__.
-
-The original code can be found `here
-<https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266>`__.
-
-
-Tips:
-
-- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see
-  the `official models <https://huggingface.co/models?other=speech2text2>`__ .
-- Speech2Text2 is always used within the :doc:`SpeechEncoderDecoder <speechencoderdecoder>` framework.
-- Speech2Text2's tokenizer is based on `fastBPE <https://github.com/glample/fastBPE>`.
-
-Inference
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Speech2Text2's :class:`~transformers.SpeechEncoderDecoderModel` model accepts raw waveform input values from speech and
-makes use of :func:`~transformers.generation_utils.GenerationMixin.generate` to translate the input speech
-autoregressively to the target language.
-
-The :class:`~transformers.Wav2Vec2FeatureExtractor` class is responsible for preprocessing the input speech and
-:class:`~transformers.Speech2Text2Tokenizer` decodes the generated target tokens to the target string. The
-:class:`~transformers.Speech2Text2Processor` wraps :class:`~transformers.Wav2Vec2FeatureExtractor` and
-:class:`~transformers.Speech2Text2Tokenizer` into a single instance to both extract the input features and decode the
-predicted token ids.
-
-- Step-by-step Speech Translation
-
-.. code-block::
-
-        >>> import torch
-        >>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
-        >>> from datasets import load_dataset
-        >>> import soundfile as sf
-
-        >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
-        >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
-
-        >>> def map_to_array(batch):
-        ...     speech, _ = sf.read(batch["file"])
-        ...     batch["speech"] = speech
-        ...     return batch
-
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        >>> ds = ds.map(map_to_array)
-
-        >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
-        >>> generated_ids = model.generate(input_ids=inputs["input_values"], attention_mask=inputs["attention_mask"])
-
-        >>> transcription = processor.batch_decode(generated_ids)
-
-
-- Speech Translation via Pipelines
-
-    The automatic speech recognition pipeline can also be used to translate speech in just a couple lines of code
-
-.. code-block::
-
-        >>> from datasets import load_dataset
-        >>> from transformers import pipeline
-
-        >>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        >>> asr = pipeline("automatic-speech-recognition", model="facebook/s2t-wav2vec2-large-en-de", feature_extractor="facebook/s2t-wav2vec2-large-en-de")
-
-        >>> translation_de = asr(librispeech_en[0]["file"])
-
-
-See `model hub <https://huggingface.co/models?filter=speech2text2>`__ to look for Speech2Text2 checkpoints.
-
-
-Speech2Text2Config
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2Text2Config
-    :members:
-
-
-Speech2TextTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2Text2Tokenizer
-    :members: batch_decode, decode, save_vocabulary
-
-
-Speech2Text2Processor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2Text2Processor
-    :members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
-
-
-Speech2Text2ForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2Text2ForCausalLM
-    :members: forward
diff --git a/docs/source/model_doc/speechencoderdecoder.mdx b/docs/source/model_doc/speechencoderdecoder.mdx
new file mode 100644
index 0000000000..7004d5250f
--- /dev/null
+++ b/docs/source/model_doc/speechencoderdecoder.mdx
@@ -0,0 +1,35 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Speech Encoder Decoder Models
+
+The [`SpeechEncoderDecoderModel`] can be used to initialize a speech-sequence-to-text-sequence model
+with any pretrained speech autoencoding model as the encoder (*e.g.* [Wav2Vec2](wav2vec2), [Hubert](hubert)) and any pretrained autoregressive model as the decoder.
+
+The effectiveness of initializing speech-sequence-to-text-sequence models with pretrained checkpoints for speech
+recognition and speech translation has *e.g.* been shown in [Large-Scale Self- and Semi-Supervised Learning for Speech
+Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli,
+Alexis Conneau.
+
+An example of how to use a [`SpeechEncoderDecoderModel`] for inference can be seen in
+[Speech2Text2](speech_to_text_2).
+
+
+## SpeechEncoderDecoderConfig
+
+[[autodoc]] SpeechEncoderDecoderConfig
+
+## SpeechEncoderDecoderModel
+
+[[autodoc]] SpeechEncoderDecoderModel
+    - forward
+    - from_encoder_decoder_pretrained
diff --git a/docs/source/model_doc/speechencoderdecoder.rst b/docs/source/model_doc/speechencoderdecoder.rst
deleted file mode 100644
index c0773e69f6..0000000000
--- a/docs/source/model_doc/speechencoderdecoder.rst
+++ /dev/null
@@ -1,40 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Speech Encoder Decoder Models
------------------------------------------------------------------------------------------------------------------------
-
-The :class:`~transformers.SpeechEncoderDecoderModel` can be used to initialize a speech-sequence-to-text-sequence model
-with any pretrained speech autoencoding model as the encoder (*e.g.* :doc:`Wav2Vec2 <wav2vec2>`, :doc:`Hubert
-<hubert>`) and any pretrained autoregressive model as the decoder.
-
-The effectiveness of initializing speech-sequence-to-text-sequence models with pretrained checkpoints for speech
-recognition and speech translation has *e.g.* been shown in `Large-Scale Self- and Semi-Supervised Learning for Speech
-Translation <https://arxiv.org/abs/2104.06678>`__ by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli,
-Alexis Conneau.
-
-An example of how to use a :class:`~transformers.SpeechEncoderDecoderModel` for inference can be seen in
-:doc:`Speech2Text2 <speech_to_text_2>`.
-
-
-SpeechEncoderDecoderConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SpeechEncoderDecoderConfig
-    :members:
-
-
-SpeechEncoderDecoderModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SpeechEncoderDecoderModel
-    :members: forward, from_encoder_decoder_pretrained
diff --git a/docs/source/model_doc/splinter.mdx b/docs/source/model_doc/splinter.mdx
new file mode 100644
index 0000000000..50d4e8db74
--- /dev/null
+++ b/docs/source/model_doc/splinter.mdx
@@ -0,0 +1,74 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Splinter
+
+## Overview
+
+The Splinter model was proposed in [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. Splinter
+is an encoder-only transformer (similar to BERT) pretrained using the recurring span selection task on a large corpus
+comprising Wikipedia and the Toronto Book Corpus.
+
+The abstract from the paper is the following:
+
+In several question answering benchmarks, pretrained models have reached human parity through fine-tuning on an order
+of 100,000 annotated questions and answers. We explore the more realistic few-shot setting, where only a few hundred
+training examples are available, and observe that standard models perform poorly, highlighting the discrepancy between
+current pretraining objectives and question answering. We propose a new pretraining scheme tailored for question
+answering: recurring span selection. Given a passage with multiple sets of recurring spans, we mask in each set all
+recurring spans but one, and ask the model to select the correct span in the passage for each masked span. Masked spans
+are replaced with a special token, viewed as a question representation, that is later used during fine-tuning to select
+the answer span. The resulting model obtains surprisingly good results on multiple benchmarks (e.g., 72.7 F1 on SQuAD
+with only 128 training examples), while maintaining competitive performance in the high-resource setting.
+
+Tips:
+
+- Splinter was trained to predict answers spans conditioned on a special [QUESTION] token. These tokens contextualize
+  to question representations which are used to predict the answers. This layer is called QASS, and is the default
+  behaviour in the [`SplinterForQuestionAnswering`] class. Therefore:
+- Use [`SplinterTokenizer`] (rather than [`BertTokenizer`]), as it already
+  contains this special token. Also, its default behavior is to use this token when two sequences are given (for
+  example, in the *run_qa.py* script).
+- If you plan on using Splinter outside *run_qa.py*, please keep in mind the question token - it might be important for
+  the success of your model, especially in a few-shot setting.
+- Please note there are two different checkpoints for each size of Splinter. Both are basically the same, except that
+  one also has the pretrained wights of the QASS layer (*tau/splinter-base-qass* and *tau/splinter-large-qass*) and one
+  doesn't (*tau/splinter-base* and *tau/splinter-large*). This is done to support randomly initializing this layer at
+  fine-tuning, as it is shown to yield better results for some cases in the paper.
+
+This model was contributed by [yuvalkirstain](https://huggingface.co/yuvalkirstain) and [oriram](https://huggingface.co/oriram). The original code can be found [here](https://github.com/oriram/splinter).
+
+## SplinterConfig
+
+[[autodoc]] SplinterConfig
+
+## SplinterTokenizer
+
+[[autodoc]] SplinterTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## SplinterTokenizerFast
+
+[[autodoc]] SplinterTokenizerFast
+
+## SplinterModel
+
+[[autodoc]] SplinterModel
+    - forward
+
+## SplinterForQuestionAnswering
+
+[[autodoc]] SplinterForQuestionAnswering
+    - forward
diff --git a/docs/source/model_doc/splinter.rst b/docs/source/model_doc/splinter.rst
deleted file mode 100644
index a57ab436cc..0000000000
--- a/docs/source/model_doc/splinter.rst
+++ /dev/null
@@ -1,87 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Splinter
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Splinter model was proposed in `Few-Shot Question Answering by Pretraining Span Selection
-<https://arxiv.org/abs/2101.00438>`__ by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. Splinter
-is an encoder-only transformer (similar to BERT) pretrained using the recurring span selection task on a large corpus
-comprising Wikipedia and the Toronto Book Corpus.
-
-The abstract from the paper is the following:
-
-In several question answering benchmarks, pretrained models have reached human parity through fine-tuning on an order
-of 100,000 annotated questions and answers. We explore the more realistic few-shot setting, where only a few hundred
-training examples are available, and observe that standard models perform poorly, highlighting the discrepancy between
-current pretraining objectives and question answering. We propose a new pretraining scheme tailored for question
-answering: recurring span selection. Given a passage with multiple sets of recurring spans, we mask in each set all
-recurring spans but one, and ask the model to select the correct span in the passage for each masked span. Masked spans
-are replaced with a special token, viewed as a question representation, that is later used during fine-tuning to select
-the answer span. The resulting model obtains surprisingly good results on multiple benchmarks (e.g., 72.7 F1 on SQuAD
-with only 128 training examples), while maintaining competitive performance in the high-resource setting.
-
-Tips:
-
-- Splinter was trained to predict answers spans conditioned on a special [QUESTION] token. These tokens contextualize
-  to question representations which are used to predict the answers. This layer is called QASS, and is the default
-  behaviour in the :class:`~transformers.SplinterForQuestionAnswering` class. Therefore:
-- Use :class:`~transformers.SplinterTokenizer` (rather than :class:`~transformers.BertTokenizer`), as it already
-  contains this special token. Also, its default behavior is to use this token when two sequences are given (for
-  example, in the `run_qa.py` script).
-- If you plan on using Splinter outside `run_qa.py`, please keep in mind the question token - it might be important for
-  the success of your model, especially in a few-shot setting.
-- Please note there are two different checkpoints for each size of Splinter. Both are basically the same, except that
-  one also has the pretrained wights of the QASS layer (`tau/splinter-base-qass` and `tau/splinter-large-qass`) and one
-  doesn't (`tau/splinter-base` and `tau/splinter-large`). This is done to support randomly initializing this layer at
-  fine-tuning, as it is shown to yield better results for some cases in the paper.
-
-This model was contributed by `yuvalkirstain <https://huggingface.co/yuvalkirstain>`__ and `oriram
-<https://huggingface.co/oriram>`__. The original code can be found `here <https://github.com/oriram/splinter>`__.
-
-SplinterConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SplinterConfig
-    :members:
-
-
-SplinterTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SplinterTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-SplinterTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SplinterTokenizerFast
-    :members:
-
-
-SplinterModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SplinterModel
-    :members: forward
-
-
-SplinterForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SplinterForQuestionAnswering
-    :members: forward
diff --git a/docs/source/model_doc/squeezebert.mdx b/docs/source/model_doc/squeezebert.mdx
new file mode 100644
index 0000000000..c6219582c8
--- /dev/null
+++ b/docs/source/model_doc/squeezebert.mdx
@@ -0,0 +1,88 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# SqueezeBERT
+
+## Overview
+
+The SqueezeBERT model was proposed in [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, Kurt W. Keutzer. It's a
+bidirectional transformer similar to the BERT model. The key difference between the BERT architecture and the
+SqueezeBERT architecture is that SqueezeBERT uses [grouped convolutions](https://blog.yani.io/filter-group-tutorial)
+instead of fully-connected layers for the Q, K, V and FFN layers.
+
+The abstract from the paper is the following:
+
+*Humans read and write hundreds of billions of messages every day. Further, due to the availability of large datasets,
+large computing systems, and better neural network models, natural language processing (NLP) technology has made
+significant strides in understanding, proofreading, and organizing these messages. Thus, there is a significant
+opportunity to deploy NLP in myriad applications to help web users, social networks, and businesses. In particular, we
+consider smartphones and other mobile devices as crucial platforms for deploying NLP models at scale. However, today's
+highly-accurate NLP neural network models such as BERT and RoBERTa are extremely computationally expensive, with
+BERT-base taking 1.7 seconds to classify a text snippet on a Pixel 3 smartphone. In this work, we observe that methods
+such as grouped convolutions have yielded significant speedups for computer vision networks, but many of these
+techniques have not been adopted by NLP neural network designers. We demonstrate how to replace several operations in
+self-attention layers with grouped convolutions, and we use this technique in a novel network architecture called
+SqueezeBERT, which runs 4.3x faster than BERT-base on the Pixel 3 while achieving competitive accuracy on the GLUE test
+set. The SqueezeBERT code will be released.*
+
+Tips:
+
+- SqueezeBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
+  rather than the left.
+- SqueezeBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore
+  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained
+  with a causal language modeling (CLM) objective are better in that regard.
+- For best results when finetuning on sequence classification tasks, it is recommended to start with the
+  *squeezebert/squeezebert-mnli-headless* checkpoint.
+
+This model was contributed by [forresti](https://huggingface.co/forresti).
+
+
+## SqueezeBertConfig
+
+[[autodoc]] SqueezeBertConfig
+
+## SqueezeBertTokenizer
+
+[[autodoc]] SqueezeBertTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## SqueezeBertTokenizerFast
+
+[[autodoc]] SqueezeBertTokenizerFast
+
+## SqueezeBertModel
+
+[[autodoc]] SqueezeBertModel
+
+## SqueezeBertForMaskedLM
+
+[[autodoc]] SqueezeBertForMaskedLM
+
+## SqueezeBertForSequenceClassification
+
+[[autodoc]] SqueezeBertForSequenceClassification
+
+## SqueezeBertForMultipleChoice
+
+[[autodoc]] SqueezeBertForMultipleChoice
+
+## SqueezeBertForTokenClassification
+
+[[autodoc]] SqueezeBertForTokenClassification
+
+## SqueezeBertForQuestionAnswering
+
+[[autodoc]] SqueezeBertForQuestionAnswering
diff --git a/docs/source/model_doc/squeezebert.rst b/docs/source/model_doc/squeezebert.rst
deleted file mode 100644
index 9f70cd655b..0000000000
--- a/docs/source/model_doc/squeezebert.rst
+++ /dev/null
@@ -1,114 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-SqueezeBERT
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The SqueezeBERT model was proposed in `SqueezeBERT: What can computer vision teach NLP about efficient neural networks?
-<https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, Kurt W. Keutzer. It's a
-bidirectional transformer similar to the BERT model. The key difference between the BERT architecture and the
-SqueezeBERT architecture is that SqueezeBERT uses `grouped convolutions <https://blog.yani.io/filter-group-tutorial>`__
-instead of fully-connected layers for the Q, K, V and FFN layers.
-
-The abstract from the paper is the following:
-
-*Humans read and write hundreds of billions of messages every day. Further, due to the availability of large datasets,
-large computing systems, and better neural network models, natural language processing (NLP) technology has made
-significant strides in understanding, proofreading, and organizing these messages. Thus, there is a significant
-opportunity to deploy NLP in myriad applications to help web users, social networks, and businesses. In particular, we
-consider smartphones and other mobile devices as crucial platforms for deploying NLP models at scale. However, today's
-highly-accurate NLP neural network models such as BERT and RoBERTa are extremely computationally expensive, with
-BERT-base taking 1.7 seconds to classify a text snippet on a Pixel 3 smartphone. In this work, we observe that methods
-such as grouped convolutions have yielded significant speedups for computer vision networks, but many of these
-techniques have not been adopted by NLP neural network designers. We demonstrate how to replace several operations in
-self-attention layers with grouped convolutions, and we use this technique in a novel network architecture called
-SqueezeBERT, which runs 4.3x faster than BERT-base on the Pixel 3 while achieving competitive accuracy on the GLUE test
-set. The SqueezeBERT code will be released.*
-
-Tips:
-
-- SqueezeBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
-  rather than the left.
-- SqueezeBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore
-  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained
-  with a causal language modeling (CLM) objective are better in that regard.
-- For best results when finetuning on sequence classification tasks, it is recommended to start with the
-  `squeezebert/squeezebert-mnli-headless` checkpoint.
-
-This model was contributed by `forresti <https://huggingface.co/forresti>`__.
-
-
-SqueezeBertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SqueezeBertConfig
-    :members:
-
-
-SqueezeBertTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SqueezeBertTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-SqueezeBertTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SqueezeBertTokenizerFast
-    :members:
-
-
-SqueezeBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SqueezeBertModel
-    :members:
-
-
-SqueezeBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SqueezeBertForMaskedLM
-    :members:
-
-
-SqueezeBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SqueezeBertForSequenceClassification
-    :members:
-
-
-SqueezeBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SqueezeBertForMultipleChoice
-    :members:
-
-
-SqueezeBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SqueezeBertForTokenClassification
-    :members:
-
-
-SqueezeBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SqueezeBertForQuestionAnswering
-    :members:
diff --git a/docs/source/model_doc/t5.mdx b/docs/source/model_doc/t5.mdx
new file mode 100644
index 0000000000..5f1e00d5ba
--- /dev/null
+++ b/docs/source/model_doc/t5.mdx
@@ -0,0 +1,342 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# T5
+
+## Overview
+
+The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
+Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
+
+The abstract from the paper is the following:
+
+*Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream
+task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning
+has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of
+transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a
+text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer
+approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration
+with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering
+summarization, question answering, text classification, and more. To facilitate future work on transfer learning for
+NLP, we release our dataset, pre-trained models, and code.*
+
+Tips:
+
+- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised and supervised tasks and for which
+  each task is converted into a text-to-text format. T5 works well on a variety of tasks out-of-the-box by prepending a
+  different prefix to the input corresponding to each task, e.g., for translation: *translate English to German: ...*,
+  for summarization: *summarize: ...*.
+
+- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
+
+- See the [training](#training), [inference](#inference) and [scripts](#scripts) sections below for all details regarding usage.
+
+T5 comes in different sizes:
+
+- [t5-small](https://huggingface.co/t5-small)
+
+- [t5-base](https://huggingface.co/t5-base)
+
+- [t5-large](https://huggingface.co/t5-large)
+
+- [t5-3b](https://huggingface.co/t5-3b)
+
+- [t5-11b](https://huggingface.co/t5-11b).
+
+Based on the original T5 model, Google has released some follow-up works:
+
+- **T5v1.1**: T5v1.1 is an improved version of T5 with some architectural tweaks, and is pre-trained on C4 only without
+  mixing in the supervised tasks. Refer to the documentation of T5v1.1 which can be found [here](t5v1.1).
+
+- **mT5**: mT5 is a multilingual T5 model. It is pre-trained on the mC4 corpus, which includes 101 languages. Refer to
+  the documentation of mT5 which can be found [here](mt5).
+
+- **byT5**: byT5 is a T5 model pre-trained on byte sequences rather than SentencePiece subword token sequences. Refer
+  to the documentation of byT5 which can be found [here](byt5).
+
+All checkpoints can be found on the [hub](https://huggingface.co/models?search=t5).
+
+This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/google-research/text-to-text-transfer-transformer).
+
+<a id='training'></a>
+
+## Training
+
+T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher
+forcing. This means that for training, we always need an input sequence and a corresponding target sequence. The input
+sequence is fed to the model using `input_ids`. The target sequence is shifted to the right, i.e., prepended by a
+start-sequence token and fed to the decoder using the `decoder_input_ids`. In teacher-forcing style, the target
+sequence is then appended by the EOS token and corresponds to the `labels`. The PAD token is hereby used as the
+start-sequence token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
+
+One can use [`T5ForConditionalGeneration`] (or the Tensorflow/Flax variant), which includes the
+language modeling head on top of the decoder.
+
+- Unsupervised denoising training
+
+  In this setup, spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
+  the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. Each
+  sentinel token represents a unique mask token for this sentence and should start with `<extra_id_0>`,
+  `<extra_id_1>`, ... up to `<extra_id_99>`. As a default, 100 sentinel tokens are available in
+  [`T5Tokenizer`].
+
+  For instance, the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be
+  processed as follows:
+
+  ```python
+  from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+  tokenizer = T5Tokenizer.from_pretrained("t5-small")
+  model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+  input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
+  labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+  # the forward function automatically creates the correct decoder_input_ids
+  loss = model(input_ids=input_ids, labels=labels).loss
+  ```
+
+  If you're interested in pre-training T5 on a new corpus, check out the [run_t5_mlm_flax.py](https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling) script in the Examples
+  directory.
+
+- Supervised training
+
+  In this setup, the input sequence and output sequence are a standard sequence-to-sequence input-output mapping.
+  Suppose that we want to fine-tune the model for translation for example, and we have a training example: the input
+  sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar.", then they should be prepared for
+  the model as follows:
+
+  ```python
+  from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+  tokenizer = T5Tokenizer.from_pretrained("t5-small")
+  model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+  input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+  labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
+  # the forward function automatically creates the correct decoder_input_ids
+  loss = model(input_ids=input_ids, labels=labels).loss
+  ```
+
+  As you can see, only 2 inputs are required for the model in order to compute a loss: `input_ids` (which are the
+  `input_ids` of the encoded input sequence) and `labels` (which are the `input_ids` of the encoded
+  target sequence). The model will automatically create the `decoder_input_ids` based on the `labels`, by
+  shifting them one position to the right and prepending the `config.decoder_start_token_id`, which for T5 is
+  equal to 0 (i.e. the id of the pad token). Also note the task prefix: we prepend the input sequence with 'translate
+  English to German: ' before encoding it. This will help in improving the performance, as this task prefix was used
+  during T5's pre-training.
+
+  However, the example above only shows a single training example. In practice, one trains deep learning models in
+  batches. This entails that we must pad/truncate examples to the same length. For encoder-decoder models, one
+  typically defines a `max_source_length` and `max_target_length`, which determine the maximum length of the
+  input and output sequences respectively (otherwise they are truncated). These should be carefully set depending on
+  the task.
+
+  In addition, we must make sure that padding token id's of the `labels` are not taken into account by the loss
+  function. In PyTorch and Tensorflow, this can be done by replacing them with -100, which is the `ignore_index`
+  of the `CrossEntropyLoss`. In Flax, one can use the `decoder_attention_mask` to ignore padded tokens from
+  the loss (see the [Flax summarization script](https://github.com/huggingface/transformers/tree/master/examples/flax/summarization) for details). We also pass
+  `attention_mask` as additional input to the model, which makes sure that padding tokens of the inputs are
+  ignored. The code example below illustrates all of this.
+
+  ```python
+  from transformers import T5Tokenizer, T5ForConditionalGeneration 
+  import torch
+
+  tokenizer = T5Tokenizer.from_pretrained("t5-small")
+  model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+  # the following 2 hyperparameters are task-specific
+  max_source_length = 512
+  max_target_length = 128
+
+  # Suppose we have the following 2 training examples:
+  input_sequence_1 = "Welcome to NYC"
+  output_sequence_1 = "Bienvenue à NYC"
+
+  input_sequence_2 = "HuggingFace is a company"
+  output_sequence_2 = "HuggingFace est une entreprise"
+
+  # encode the inputs
+  task_prefix = "translate English to French: "
+  input_sequences = [input_sequence_1, input_sequence_2]
+  encoding = tokenizer([task_prefix + sequence for sequence in input_sequences], 
+                      padding='longest', 
+                      max_length=max_source_length, 
+                      truncation=True, 
+                      return_tensors="pt")
+  input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
+
+  # encode the targets
+  target_encoding = tokenizer([output_sequence_1, output_sequence_2], 
+                              padding='longest', 
+                              max_length=max_target_length, 
+                              truncation=True)
+  labels = target_encoding.input_ids
+
+  # replace padding token id's of the labels by -100
+  labels = torch.tensor(labels)
+  labels[labels == tokenizer.pad_token_id] = -100
+
+  # forward pass
+  loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
+  ```
+
+Additional training tips:
+
+- T5 models need a slightly higher learning rate than the default one set in the `Trainer` when using the AdamW
+  optimizer. Typically, 1e-4 and 3e-4 work well for most problems (classification, summarization, translation, question
+  answering, question generation). Note that T5 was pre-trained using the AdaFactor optimizer.
+
+- According to [this forum post](https://discuss.huggingface.co/t/t5-finetuning-tips/684), task prefixes matter when
+  (1) doing multi-task training (2) your task is similar or related to one of the supervised tasks used in T5's
+  pre-training mixture (see Appendix D of the [paper](https://arxiv.org/pdf/1910.10683.pdf) for the task prefixes
+  used).
+
+- If training on TPU, it is recommended to pad all examples of the dataset to the same length or make use of
+  *pad_to_multiple_of* to have a small number of predefined bucket sizes to fit all examples in. Dynamically padding
+  batches to the longest example is not recommended on TPU as it triggers a recompilation for every batch shape that is
+  encountered during training thus significantly slowing down the training. only padding up to the longest example in a
+  batch) leads to very slow training on TPU.
+
+<a id='inference'></a>
+
+## Inference
+
+At inference time, it is recommended to use [`~generation_utils.GenerationMixin.generate`]. This
+method takes care of encoding the input and feeding the encoded hidden states via cross-attention layers to the decoder
+and auto-regressively generates the decoder output. Check out [this blog post](https://huggingface.co/blog/how-to-generate) to know all the details about generating text with Transformers.
+There's also [this blog post](https://huggingface.co/blog/encoder-decoder#encoder-decoder) which explains how
+generation works in general in encoder-decoder models.
+
+```python
+from transformers import T5Tokenizer, T5ForConditionalGeneration 
+
+tokenizer = T5Tokenizer.from_pretrained("t5-small")
+model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+# Das Haus ist wunderbar.
+```
+
+Note that T5 uses the `pad_token_id` as the `decoder_start_token_id`, so when doing generation without using
+[`~generation_utils.GenerationMixin.generate`], make sure you start it with the `pad_token_id`.
+
+The example above only shows a single example. You can also do batched inference, like so:
+
+```python
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+tokenizer = T5Tokenizer.from_pretrained("t5-small")
+model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+# when generating, we will use the logits of right-most token to predict the next token
+# so the padding should be on the left
+tokenizer.padding_side = "left" 
+tokenizer.pad_token = tokenizer.eos_token # to avoid an error
+
+task_prefix = 'translate English to German: '
+sentences = ['The house is wonderful.', 'I like to work in NYC.'] # use different length sentences to test batching
+inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
+
+output_sequences = model.generate(
+    input_ids=inputs['input_ids'],
+    attention_mask=inputs['attention_mask'],
+    do_sample=False, # disable sampling to test if batching affects output
+)
+
+print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))
+
+# ['Das Haus ist wunderbar.', 'Ich arbeite gerne in NYC.']
+```
+
+<a id='scripts'></a>
+
+## Example scripts
+
+T5 is supported by several example scripts, both for pre-training and fine-tuning.
+
+- pre-training: the [run_t5_mlm_flax.py](https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/run_t5_mlm_flax.py)
+  script allows you to further pre-train T5 or pre-train T5 from scratch on your own data. The [t5_tokenizer_model.py](https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/t5_tokenizer_model.py)
+  script allows you to further train a T5 tokenizer or train a T5 Tokenizer from scratch on your own data. Note that
+  Flax (a neural network library on top of JAX) is particularly useful to train on TPU hardware.
+
+- fine-tuning: T5 is supported by the official summarization scripts ([PyTorch](https://github.com/huggingface/transformers/tree/master/examples/pytorch/summarization), [Tensorflow](https://github.com/huggingface/transformers/tree/master/examples/tensorflow/summarization), and [Flax](https://github.com/huggingface/transformers/tree/master/examples/flax/summarization)) and translation scripts
+  ([PyTorch](https://github.com/huggingface/transformers/tree/master/examples/pytorch/translation) and [Tensorflow](https://github.com/huggingface/transformers/tree/master/examples/tensorflow/translation)). These scripts allow
+  you to easily fine-tune T5 on custom data for summarization/translation.
+
+## T5Config
+
+[[autodoc]] T5Config
+
+## T5Tokenizer
+
+[[autodoc]] T5Tokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## T5TokenizerFast
+
+[[autodoc]] T5TokenizerFast
+
+## T5Model
+
+[[autodoc]] T5Model
+    - forward
+    - parallelize
+    - deparallelize
+
+## T5ForConditionalGeneration
+
+[[autodoc]] T5ForConditionalGeneration
+    - forward
+    - parallelize
+    - deparallelize
+
+## T5EncoderModel
+
+[[autodoc]] T5EncoderModel
+    - forward
+    - parallelize
+    - deparallelize
+
+## TFT5Model
+
+[[autodoc]] TFT5Model
+    - call
+
+## TFT5ForConditionalGeneration
+
+[[autodoc]] TFT5ForConditionalGeneration
+    - call
+
+## TFT5EncoderModel
+
+[[autodoc]] TFT5EncoderModel
+    - call
+
+## FlaxT5Model
+
+[[autodoc]] FlaxT5Model
+    - __call__
+    - encode
+    - decode
+
+## FlaxT5ForConditionalGeneration
+
+[[autodoc]] FlaxT5ForConditionalGeneration
+    - __call__
+    - encode
+    - decode
diff --git a/docs/source/model_doc/t5.rst b/docs/source/model_doc/t5.rst
deleted file mode 100644
index 19edd137a5..0000000000
--- a/docs/source/model_doc/t5.rst
+++ /dev/null
@@ -1,364 +0,0 @@
-..
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-T5
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The T5 model was presented in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
-<https://arxiv.org/pdf/1910.10683.pdf>`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
-Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
-
-The abstract from the paper is the following:
-
-*Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream
-task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning
-has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of
-transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a
-text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer
-approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration
-with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering
-summarization, question answering, text classification, and more. To facilitate future work on transfer learning for
-NLP, we release our dataset, pre-trained models, and code.*
-
-Tips:
-
-- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised and supervised tasks and for which
-  each task is converted into a text-to-text format. T5 works well on a variety of tasks out-of-the-box by prepending a
-  different prefix to the input corresponding to each task, e.g., for translation: *translate English to German: ...*,
-  for summarization: *summarize: ...*.
-
-- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
-
-- See the :ref:`training`, :ref:`inference` and :ref:`scripts` sections below for all details regarding usage.
-
-T5 comes in different sizes:
-
-- `t5-small <https://huggingface.co/t5-small>`__
-
-- `t5-base <https://huggingface.co/t5-base>`__
-
-- `t5-large <https://huggingface.co/t5-large>`__
-
-- `t5-3b <https://huggingface.co/t5-3b>`__
-
-- `t5-11b <https://huggingface.co/t5-11b>`__.
-
-Based on the original T5 model, Google has released some follow-up works:
-
-- **T5v1.1**: T5v1.1 is an improved version of T5 with some architectural tweaks, and is pre-trained on C4 only without
-  mixing in the supervised tasks. Refer to the documentation of T5v1.1 which can be found :doc:`here <t5v1.1>`.
-
-- **mT5**: mT5 is a multilingual T5 model. It is pre-trained on the mC4 corpus, which includes 101 languages. Refer to
-  the documentation of mT5 which can be found :doc:`here <mt5>`.
-
-- **byT5**: byT5 is a T5 model pre-trained on byte sequences rather than SentencePiece subword token sequences. Refer
-  to the documentation of byT5 which can be found :doc:`here <byt5>`.
-
-All checkpoints can be found on the `hub <https://huggingface.co/models?search=t5>`__.
-
-This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
-<https://github.com/google-research/text-to-text-transfer-transformer>`__.
-
-.. _training:
-
-Training
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher
-forcing. This means that for training, we always need an input sequence and a corresponding target sequence. The input
-sequence is fed to the model using :obj:`input_ids`. The target sequence is shifted to the right, i.e., prepended by a
-start-sequence token and fed to the decoder using the :obj:`decoder_input_ids`. In teacher-forcing style, the target
-sequence is then appended by the EOS token and corresponds to the :obj:`labels`. The PAD token is hereby used as the
-start-sequence token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
-
-One can use :class:`~transformers.T5ForConditionalGeneration` (or the Tensorflow/Flax variant), which includes the
-language modeling head on top of the decoder.
-
-- Unsupervised denoising training
-
-  In this setup, spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
-  the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. Each
-  sentinel token represents a unique mask token for this sentence and should start with :obj:`<extra_id_0>`,
-  :obj:`<extra_id_1>`, ... up to :obj:`<extra_id_99>`. As a default, 100 sentinel tokens are available in
-  :class:`~transformers.T5Tokenizer`.
-
-  For instance, the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be
-  processed as follows:
-
-    .. code-block::
-
-        from transformers import T5Tokenizer, T5ForConditionalGeneration
-
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
-        model = T5ForConditionalGeneration.from_pretrained("t5-small")
-
-        input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-        labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
-        # the forward function automatically creates the correct decoder_input_ids
-        loss = model(input_ids=input_ids, labels=labels).loss
-
-  If you're interested in pre-training T5 on a new corpus, check out the `run_t5_mlm_flax.py
-  <https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling>`__ script in the Examples
-  directory.
-
-- Supervised training
-
-  In this setup, the input sequence and output sequence are a standard sequence-to-sequence input-output mapping.
-  Suppose that we want to fine-tune the model for translation for example, and we have a training example: the input
-  sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar.", then they should be prepared for
-  the model as follows:
-
-    .. code-block::
-
-        from transformers import T5Tokenizer, T5ForConditionalGeneration
-
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
-        model = T5ForConditionalGeneration.from_pretrained("t5-small")
-
-        input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
-        labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
-        # the forward function automatically creates the correct decoder_input_ids
-        loss = model(input_ids=input_ids, labels=labels).loss
-
-  As you can see, only 2 inputs are required for the model in order to compute a loss: :obj:`input_ids` (which are the
-  :obj:`input_ids` of the encoded input sequence) and :obj:`labels` (which are the :obj:`input_ids` of the encoded
-  target sequence). The model will automatically create the :obj:`decoder_input_ids` based on the :obj:`labels`, by
-  shifting them one position to the right and prepending the :obj:`config.decoder_start_token_id`, which for T5 is
-  equal to 0 (i.e. the id of the pad token). Also note the task prefix: we prepend the input sequence with 'translate
-  English to German: ' before encoding it. This will help in improving the performance, as this task prefix was used
-  during T5's pre-training.
-
-  However, the example above only shows a single training example. In practice, one trains deep learning models in
-  batches. This entails that we must pad/truncate examples to the same length. For encoder-decoder models, one
-  typically defines a :obj:`max_source_length` and :obj:`max_target_length`, which determine the maximum length of the
-  input and output sequences respectively (otherwise they are truncated). These should be carefully set depending on
-  the task.
-
-  In addition, we must make sure that padding token id's of the :obj:`labels` are not taken into account by the loss
-  function. In PyTorch and Tensorflow, this can be done by replacing them with -100, which is the :obj:`ignore_index`
-  of the :obj:`CrossEntropyLoss`. In Flax, one can use the :obj:`decoder_attention_mask` to ignore padded tokens from
-  the loss (see the `Flax summarization script
-  <https://github.com/huggingface/transformers/tree/master/examples/flax/summarization>`__ for details). We also pass
-  :obj:`attention_mask` as additional input to the model, which makes sure that padding tokens of the inputs are
-  ignored. The code example below illustrates all of this.
-
-    .. code-block::
-
-        from transformers import T5Tokenizer, T5ForConditionalGeneration 
-        import torch
-
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
-        model = T5ForConditionalGeneration.from_pretrained("t5-small")
-
-        # the following 2 hyperparameters are task-specific
-        max_source_length = 512
-        max_target_length = 128
-
-        # Suppose we have the following 2 training examples:
-        input_sequence_1 = "Welcome to NYC"
-        output_sequence_1 = "Bienvenue à NYC"
-
-        input_sequence_2 = "HuggingFace is a company"
-        output_sequence_2 = "HuggingFace est une entreprise"
-
-        # encode the inputs
-        task_prefix = "translate English to French: "
-        input_sequences = [input_sequence_1, input_sequence_2]
-        encoding = tokenizer([task_prefix + sequence for sequence in input_sequences], 
-                             padding='longest', 
-                             max_length=max_source_length, 
-                             truncation=True, 
-                             return_tensors="pt")
-        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
-
-        # encode the targets
-        target_encoding = tokenizer([output_sequence_1, output_sequence_2], 
-                                    padding='longest', 
-                                    max_length=max_target_length, 
-                                    truncation=True)
-        labels = target_encoding.input_ids
-
-        # replace padding token id's of the labels by -100
-        labels = torch.tensor(labels)
-        labels[labels == tokenizer.pad_token_id] = -100
-
-        # forward pass
-        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
-
-Additional training tips:
-
-- T5 models need a slightly higher learning rate than the default one set in the :obj:`Trainer` when using the AdamW
-  optimizer. Typically, 1e-4 and 3e-4 work well for most problems (classification, summarization, translation, question
-  answering, question generation). Note that T5 was pre-trained using the AdaFactor optimizer.
-
-- According to `this forum post <https://discuss.huggingface.co/t/t5-finetuning-tips/684>`__, task prefixes matter when
-  (1) doing multi-task training (2) your task is similar or related to one of the supervised tasks used in T5's
-  pre-training mixture (see Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`__ for the task prefixes
-  used).
-
-- If training on TPU, it is recommended to pad all examples of the dataset to the same length or make use of
-  `pad_to_multiple_of` to have a small number of predefined bucket sizes to fit all examples in. Dynamically padding
-  batches to the longest example is not recommended on TPU as it triggers a recompilation for every batch shape that is
-  encountered during training thus significantly slowing down the training. only padding up to the longest example in a
-  batch) leads to very slow training on TPU.
-
-.. _inference:
-
-Inference
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-At inference time, it is recommended to use :meth:`~transformers.generation_utils.GenerationMixin.generate`. This
-method takes care of encoding the input and feeding the encoded hidden states via cross-attention layers to the decoder
-and auto-regressively generates the decoder output. Check out `this blog post
-<https://huggingface.co/blog/how-to-generate>`__ to know all the details about generating text with Transformers.
-There's also `this blog post <https://huggingface.co/blog/encoder-decoder#encoder-decoder>`__ which explains how
-generation works in general in encoder-decoder models.
-
-.. code-block::
-
-        from transformers import T5Tokenizer, T5ForConditionalGeneration 
-
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
-        model = T5ForConditionalGeneration.from_pretrained("t5-small")
-
-        input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
-        outputs = model.generate(input_ids)
-        print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-        # Das Haus ist wunderbar.
-
-Note that T5 uses the :obj:`pad_token_id` as the :obj:`decoder_start_token_id`, so when doing generation without using
-:meth:`~transformers.generation_utils.GenerationMixin.generate`, make sure you start it with the :obj:`pad_token_id`.
-
-The example above only shows a single example. You can also do batched inference, like so:
-
-.. code-block::
-
-        from transformers import T5Tokenizer, T5ForConditionalGeneration
-
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
-        model = T5ForConditionalGeneration.from_pretrained("t5-small")
-
-        # when generating, we will use the logits of right-most token to predict the next token
-        # so the padding should be on the left
-        tokenizer.padding_side = "left" 
-        tokenizer.pad_token = tokenizer.eos_token # to avoid an error
-
-        task_prefix = 'translate English to German: '
-        sentences = ['The house is wonderful.', 'I like to work in NYC.'] # use different length sentences to test batching
-        inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
-
-        output_sequences = model.generate(
-            input_ids=inputs['input_ids'],
-            attention_mask=inputs['attention_mask'],
-            do_sample=False, # disable sampling to test if batching affects output
-        )
-
-        print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))
-
-        # ['Das Haus ist wunderbar.', 'Ich arbeite gerne in NYC.']
-
-.. _scripts:
-
-Example scripts
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-T5 is supported by several example scripts, both for pre-training and fine-tuning.
-
-* pre-training: the `run_t5_mlm_flax.py
-  <https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/run_t5_mlm_flax.py>`__
-  script allows you to further pre-train T5 or pre-train T5 from scratch on your own data. The `t5_tokenizer_model.py
-  <https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/t5_tokenizer_model.py>`__
-  script allows you to further train a T5 tokenizer or train a T5 Tokenizer from scratch on your own data. Note that
-  Flax (a neural network library on top of JAX) is particularly useful to train on TPU hardware.
-
-* fine-tuning: T5 is supported by the official summarization scripts (`PyTorch
-  <https://github.com/huggingface/transformers/tree/master/examples/pytorch/summarization>`__, `Tensorflow
-  <https://github.com/huggingface/transformers/tree/master/examples/tensorflow/summarization>`__, and `Flax
-  <https://github.com/huggingface/transformers/tree/master/examples/flax/summarization>`__) and translation scripts
-  (`PyTorch <https://github.com/huggingface/transformers/tree/master/examples/pytorch/translation>`__ and `Tensorflow
-  <https://github.com/huggingface/transformers/tree/master/examples/tensorflow/translation>`__). These scripts allow
-  you to easily fine-tune T5 on custom data for summarization/translation.
-
-T5Config
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.T5Config
-    :members:
-
-
-T5Tokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.T5Tokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-T5TokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.T5TokenizerFast
-    :members:
-
-
-T5Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.T5Model
-    :members: forward, parallelize, deparallelize
-
-
-T5ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.T5ForConditionalGeneration
-    :members: forward, parallelize, deparallelize
-
-T5EncoderModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.T5EncoderModel
-    :members: forward, parallelize, deparallelize
-
-TFT5Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFT5Model
-    :members: call
-
-
-TFT5ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFT5ForConditionalGeneration
-    :members: call
-
-TFT5EncoderModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFT5EncoderModel
-    :members: call
-
-FlaxT5Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxT5Model
-    :members: __call__, encode, decode
-
-FlaxT5ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxT5ForConditionalGeneration
-    :members: __call__, encode, decode
diff --git a/docs/source/model_doc/t5v1.1.mdx b/docs/source/model_doc/t5v1.1.mdx
new file mode 100644
index 0000000000..5c829213a6
--- /dev/null
+++ b/docs/source/model_doc/t5v1.1.mdx
@@ -0,0 +1,61 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# T5v1.1
+
+## Overview
+
+T5v1.1 was released in the [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511)
+repository by Colin Raffel et al. It's an improved version of the original T5 model.
+
+One can directly plug in the weights of T5v1.1 into a T5 model, like so:
+
+```python
+from transformers import T5ForConditionalGeneration
+
+model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-base')
+```
+
+T5 Version 1.1 includes the following improvements compared to the original T5 model:
+
+- GEGLU activation in the feed-forward hidden layer, rather than ReLU. See [this paper](https://arxiv.org/abs/2002.05202).
+
+- Dropout was turned off in pre-training (quality win). Dropout should be re-enabled during fine-tuning.
+
+- Pre-trained on C4 only without mixing in the downstream tasks.
+
+- No parameter sharing between the embedding and classifier layer.
+
+- "xl" and "xxl" replace "3B" and "11B". The model shapes are a bit different - larger `d_model` and smaller
+  `num_heads` and `d_ff`.
+
+Note: T5 Version 1.1 was only pre-trained on [C4](https://huggingface.co/datasets/c4) excluding any supervised
+training. Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5
+model. Since t5v1.1 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
+fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
+
+Google has released the following variants:
+
+- [google/t5-v1_1-small](https://huggingface.co/google/t5-v1_1-small)
+
+- [google/t5-v1_1-base](https://huggingface.co/google/t5-v1_1-base)
+
+- [google/t5-v1_1-large](https://huggingface.co/google/t5-v1_1-large)
+
+- [google/t5-v1_1-xl](https://huggingface.co/google/t5-v1_1-xl)
+
+- [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl).
+
+One can refer to [T5's documentation page](t5) for all tips, code examples and notebooks.
+
+This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be
+found [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511).
diff --git a/docs/source/model_doc/t5v1.1.rst b/docs/source/model_doc/t5v1.1.rst
deleted file mode 100644
index 9de0881e21..0000000000
--- a/docs/source/model_doc/t5v1.1.rst
+++ /dev/null
@@ -1,66 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-T5v1.1
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-T5v1.1 was released in the `google-research/text-to-text-transfer-transformer
-<https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__
-repository by Colin Raffel et al. It's an improved version of the original T5 model.
-
-One can directly plug in the weights of T5v1.1 into a T5 model, like so:
-
-.. code-block::
-
-    from transformers import T5ForConditionalGeneration
-
-    model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-base') 
-
-T5 Version 1.1 includes the following improvements compared to the original T5 model:
-
-- GEGLU activation in the feed-forward hidden layer, rather than ReLU. See `this paper
-  <https://arxiv.org/abs/2002.05202>`__.
-
-- Dropout was turned off in pre-training (quality win). Dropout should be re-enabled during fine-tuning.
-
-- Pre-trained on C4 only without mixing in the downstream tasks.
-
-- No parameter sharing between the embedding and classifier layer.
-
-- "xl" and "xxl" replace "3B" and "11B". The model shapes are a bit different - larger :obj:`d_model` and smaller
-  :obj:`num_heads` and :obj:`d_ff`.
-
-Note: T5 Version 1.1 was only pre-trained on `C4 <https://huggingface.co/datasets/c4>`__ excluding any supervised
-training. Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5
-model. Since t5v1.1 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
-fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
-
-Google has released the following variants:
-
-- `google/t5-v1_1-small <https://huggingface.co/google/t5-v1_1-small>`__
-
-- `google/t5-v1_1-base <https://huggingface.co/google/t5-v1_1-base>`__
-
-- `google/t5-v1_1-large <https://huggingface.co/google/t5-v1_1-large>`__
-
-- `google/t5-v1_1-xl <https://huggingface.co/google/t5-v1_1-xl>`__
-
-- `google/t5-v1_1-xxl <https://huggingface.co/google/t5-v1_1-xxl>`__.
-
-One can refer to :doc:`T5's documentation page <t5>` for all tips, code examples and notebooks.
-
-This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
-found `here
-<https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__.
diff --git a/docs/source/model_doc/tapas.mdx b/docs/source/model_doc/tapas.mdx
index dcee338672..fe22d63659 100644
--- a/docs/source/model_doc/tapas.mdx
+++ b/docs/source/model_doc/tapas.mdx
@@ -36,7 +36,7 @@ In addition, the authors have further pre-trained TAPAS to recognize **table ent
 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tapas_architecture.png"
 alt="drawing" width="600"/> 
 
-<small> TAPAS architecture. Taken from the [official blog post](https://ai.googleblog.com/2020/04/using-neural-networks-to-find-answers.html). </small>
+<small> TAPAS architecture. Taken from the <a href="https://ai.googleblog.com/2020/04/using-neural-networks-to-find-answers.html">original blog post</a>.</small>
 
 This model was contributed by [nielsr](https://huggingface.co/nielsr). The Tensorflow version of this model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/google-research/tapas).
 
diff --git a/docs/source/model_doc/transformerxl.mdx b/docs/source/model_doc/transformerxl.mdx
new file mode 100644
index 0000000000..7e2a770142
--- /dev/null
+++ b/docs/source/model_doc/transformerxl.mdx
@@ -0,0 +1,103 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Transformer XL
+
+## Overview
+
+The Transformer-XL model was proposed in [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan
+Salakhutdinov. It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can
+reuse previously computed hidden-states to attend to longer context (memory). This model also uses adaptive softmax
+inputs and outputs (tied).
+
+The abstract from the paper is the following:
+
+*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the
+setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency
+beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and a
+novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves the
+context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450%
+longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+
+times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results of
+bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn
+Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably
+coherent, novel text articles with thousands of tokens.*
+
+Tips:
+
+- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. The
+  original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
+- Transformer-XL is one of the few models that has no sequence length limit.
+
+This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/kimiyoung/transformer-xl).
+
+<Tip warning={true}>
+
+TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035)
+
+</Tip>
+
+
+## TransfoXLConfig
+
+[[autodoc]] TransfoXLConfig
+
+## TransfoXLTokenizer
+
+[[autodoc]] TransfoXLTokenizer
+    - save_vocabulary
+
+## TransfoXL specific outputs
+
+[[autodoc]] models.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput
+
+[[autodoc]] models.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput
+
+[[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLModelOutput
+
+[[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput
+
+## TransfoXLModel
+
+[[autodoc]] TransfoXLModel
+    - forward
+
+## TransfoXLLMHeadModel
+
+[[autodoc]] TransfoXLLMHeadModel
+    - forward
+
+## TransfoXLForSequenceClassification
+
+[[autodoc]] TransfoXLForSequenceClassification
+    - forward
+
+## TFTransfoXLModel
+
+[[autodoc]] TFTransfoXLModel
+    - call
+
+## TFTransfoXLLMHeadModel
+
+[[autodoc]] TFTransfoXLLMHeadModel
+    - call
+
+## TFTransfoXLForSequenceClassification
+
+[[autodoc]] TFTransfoXLForSequenceClassification
+    - call
+
+## Internal Layers
+
+[[autodoc]] AdaptiveEmbedding
+
+[[autodoc]] TFAdaptiveEmbedding
diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst
deleted file mode 100644
index 178268f522..0000000000
--- a/docs/source/model_doc/transformerxl.rst
+++ /dev/null
@@ -1,130 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Transformer XL
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Transformer-XL model was proposed in `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context
-<https://arxiv.org/abs/1901.02860>`__ by Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan
-Salakhutdinov. It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can
-reuse previously computed hidden-states to attend to longer context (memory). This model also uses adaptive softmax
-inputs and outputs (tied).
-
-The abstract from the paper is the following:
-
-*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the
-setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency
-beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and a
-novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves the
-context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450%
-longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+
-times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results of
-bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn
-Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably
-coherent, novel text articles with thousands of tokens.*
-
-Tips:
-
-- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. The
-  original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
-- Transformer-XL is one of the few models that has no sequence length limit.
-
-This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
-<https://github.com/kimiyoung/transformer-xl>`__.
-
-**Note**:
-
-- TransformerXL does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see `issue #36035
-  <https://github.com/pytorch/pytorch/issues/36035>`__
-
-
-TransfoXLConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TransfoXLConfig
-    :members:
-
-
-TransfoXLTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TransfoXLTokenizer
-    :members: save_vocabulary
-
-
-TransfoXL specific outputs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.models.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput
-    :members:
-
-.. autoclass:: transformers.models.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput
-    :members:
-
-.. autoclass:: transformers.models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLModelOutput
-    :members:
-
-.. autoclass:: transformers.models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput
-    :members:
-
-
-TransfoXLModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TransfoXLModel
-    :members: forward
-
-
-TransfoXLLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TransfoXLLMHeadModel
-    :members: forward
-
-
-TransfoXLForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TransfoXLForSequenceClassification
-    :members: forward
-
-
-TFTransfoXLModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFTransfoXLModel
-    :members: call
-
-
-TFTransfoXLLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFTransfoXLLMHeadModel
-    :members: call
-
-
-TFTransfoXLForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFTransfoXLForSequenceClassification
-    :members: call
-
-
-Internal Layers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AdaptiveEmbedding
-
-.. autoclass:: transformers.TFAdaptiveEmbedding
diff --git a/docs/source/model_doc/trocr.mdx b/docs/source/model_doc/trocr.mdx
index d090836597..961fce6d28 100644
--- a/docs/source/model_doc/trocr.mdx
+++ b/docs/source/model_doc/trocr.mdx
@@ -32,7 +32,7 @@ tasks.*
 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/trocr_architecture.jpg"
 alt="drawing" width="600"/> 
 
-<small> TrOCR architecture. Taken from the [original paper](https://arxiv.org/abs/2109.10282). </small>
+<small> TrOCR architecture. Taken from the <a href="https://arxiv.org/abs/2109.10282">original paper</a>. </small>
 
 Please refer to the [`VisionEncoderDecoder`] class on how to use this model.
 
diff --git a/docs/source/model_doc/unispeech.mdx b/docs/source/model_doc/unispeech.mdx
new file mode 100644
index 0000000000..a55d759396
--- /dev/null
+++ b/docs/source/model_doc/unispeech.mdx
@@ -0,0 +1,71 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# UniSpeech
+
+## Overview
+
+The UniSpeech model was proposed in [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael
+Zeng, Xuedong Huang .
+
+The abstract from the paper is the following:
+
+*In this paper, we propose a unified pre-training approach called UniSpeech to learn speech representations with both
+unlabeled and labeled data, in which supervised phonetic CTC learning and phonetically-aware contrastive
+self-supervised learning are conducted in a multi-task learning manner. The resultant representations can capture
+information more correlated with phonetic structures and improve the generalization across languages and domains. We
+evaluate the effectiveness of UniSpeech for cross-lingual representation learning on public CommonVoice corpus. The
+results show that UniSpeech outperforms self-supervised pretraining and supervised transfer learning for speech
+recognition by a maximum of 13.4% and 17.8% relative phone error rate reductions respectively (averaged over all
+testing languages). The transferability of UniSpeech is also demonstrated on a domain-shift speech recognition task,
+i.e., a relative word error rate reduction of 6% against the previous approach.*
+
+Tips:
+
+- UniSpeech is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please
+  use [`Wav2Vec2Processor`] for the feature extraction.
+- UniSpeech model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be
+  decoded using [`Wav2Vec2CTCTokenizer`].
+
+This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be
+found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech).
+
+
+## UniSpeechConfig
+
+[[autodoc]] UniSpeechConfig
+
+## UniSpeech specific outputs
+
+[[autodoc]] models.unispeech.modeling_unispeech.UniSpeechBaseModelOutput
+
+[[autodoc]] models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput
+
+## UniSpeechModel
+
+[[autodoc]] UniSpeechModel
+    - forward
+
+## UniSpeechForCTC
+
+[[autodoc]] UniSpeechForCTC
+    - forward
+
+## UniSpeechForSequenceClassification
+
+[[autodoc]] UniSpeechForSequenceClassification
+    - forward
+
+## UniSpeechForPreTraining
+
+[[autodoc]] UniSpeechForPreTraining
+    - forward
diff --git a/docs/source/model_doc/unispeech.rst b/docs/source/model_doc/unispeech.rst
deleted file mode 100644
index 5ca29d5590..0000000000
--- a/docs/source/model_doc/unispeech.rst
+++ /dev/null
@@ -1,88 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-UniSpeech
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The UniSpeech model was proposed in `UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data
-<https://arxiv.org/abs/2101.07597>`__ by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael
-Zeng, Xuedong Huang .
-
-The abstract from the paper is the following:
-
-*In this paper, we propose a unified pre-training approach called UniSpeech to learn speech representations with both
-unlabeled and labeled data, in which supervised phonetic CTC learning and phonetically-aware contrastive
-self-supervised learning are conducted in a multi-task learning manner. The resultant representations can capture
-information more correlated with phonetic structures and improve the generalization across languages and domains. We
-evaluate the effectiveness of UniSpeech for cross-lingual representation learning on public CommonVoice corpus. The
-results show that UniSpeech outperforms self-supervised pretraining and supervised transfer learning for speech
-recognition by a maximum of 13.4% and 17.8% relative phone error rate reductions respectively (averaged over all
-testing languages). The transferability of UniSpeech is also demonstrated on a domain-shift speech recognition task,
-i.e., a relative word error rate reduction of 6% against the previous approach.*
-
-Tips:
-
-- UniSpeech is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please
-  use :class:`~transformers.Wav2Vec2Processor` for the feature extraction.
-- UniSpeech model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be
-  decoded using :class:`~transformers.Wav2Vec2CTCTokenizer`.
-
-This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The Authors' code can be
-found `here <https://github.com/microsoft/UniSpeech/tree/main/UniSpeech>`__.
-
-
-UniSpeechConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.UniSpeechConfig
-    :members:
-
-
-UniSpeech specific outputs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.models.unispeech.modeling_unispeech.UniSpeechBaseModelOutput
-    :members: 
-
-.. autoclass:: transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput
-    :members: 
-
-
-UniSpeechModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.UniSpeechModel
-    :members: forward
-
-
-UniSpeechForCTC
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.UniSpeechForCTC
-    :members: forward
-
-
-UniSpeechForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.UniSpeechForSequenceClassification
-    :members: forward
-
-
-UniSpeechForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.UniSpeechForPreTraining
-    :members: forward
diff --git a/docs/source/model_doc/unispeech_sat.mdx b/docs/source/model_doc/unispeech_sat.mdx
new file mode 100644
index 0000000000..724f5f908a
--- /dev/null
+++ b/docs/source/model_doc/unispeech_sat.mdx
@@ -0,0 +1,86 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# UniSpeech-SAT
+
+## Overview
+
+The UniSpeech-SAT model was proposed in [UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware
+Pre-Training](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen,
+Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu .
+
+The abstract from the paper is the following:
+
+*Self-supervised learning (SSL) is a long-standing goal for speech processing, since it utilizes large-scale unlabeled
+data and avoids extensive human labeling. Recent years witness great successes in applying self-supervised learning in
+speech recognition, while limited exploration was attempted in applying SSL for modeling speaker characteristics. In
+this paper, we aim to improve the existing SSL framework for speaker representation learning. Two methods are
+introduced for enhancing the unsupervised speaker information extraction. First, we apply the multi-task learning to
+the current SSL framework, where we integrate the utterance-wise contrastive loss with the SSL objective function.
+Second, for better speaker discrimination, we propose an utterance mixing strategy for data augmentation, where
+additional overlapped utterances are created unsupervisely and incorporate during training. We integrate the proposed
+methods into the HuBERT framework. Experiment results on SUPERB benchmark show that the proposed system achieves
+state-of-the-art performance in universal representation learning, especially for speaker identification oriented
+tasks. An ablation study is performed verifying the efficacy of each proposed method. Finally, we scale up training
+dataset to 94 thousand hours public audio data and achieve further performance improvement in all SUPERB tasks.*
+
+Tips:
+
+- UniSpeechSat is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
+  Please use [`Wav2Vec2Processor`] for the feature extraction.
+- UniSpeechSat model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be
+  decoded using [`Wav2Vec2CTCTokenizer`].
+- UniSpeechSat performs especially well on speaker verification, speaker identification, and speaker diarization tasks.
+
+This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be
+found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT).
+
+
+## UniSpeechSatConfig
+
+[[autodoc]] UniSpeechSatConfig
+
+## UniSpeechSat specific outputs
+
+[[autodoc]] models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatBaseModelOutput
+
+[[autodoc]] models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatForPreTrainingOutput
+
+## UniSpeechSatModel
+
+[[autodoc]] UniSpeechSatModel
+    - forward
+
+## UniSpeechSatForCTC
+
+[[autodoc]] UniSpeechSatForCTC
+    - forward
+
+## UniSpeechSatForSequenceClassification
+
+[[autodoc]] UniSpeechSatForSequenceClassification
+    - forward
+
+## UniSpeechSatForAudioFrameClassification
+
+[[autodoc]] UniSpeechSatForAudioFrameClassification
+    - forward
+
+## UniSpeechSatForXVector
+
+[[autodoc]] UniSpeechSatForXVector
+    - forward
+
+## UniSpeechSatForPreTraining
+
+[[autodoc]] UniSpeechSatForPreTraining
+    - forward
diff --git a/docs/source/model_doc/unispeech_sat.rst b/docs/source/model_doc/unispeech_sat.rst
deleted file mode 100644
index 96c4311b06..0000000000
--- a/docs/source/model_doc/unispeech_sat.rst
+++ /dev/null
@@ -1,106 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-UniSpeech-SAT
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The UniSpeech-SAT model was proposed in `UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware
-Pre-Training <https://arxiv.org/abs/2110.05752>`__ by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen,
-Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu .
-
-The abstract from the paper is the following:
-
-*Self-supervised learning (SSL) is a long-standing goal for speech processing, since it utilizes large-scale unlabeled
-data and avoids extensive human labeling. Recent years witness great successes in applying self-supervised learning in
-speech recognition, while limited exploration was attempted in applying SSL for modeling speaker characteristics. In
-this paper, we aim to improve the existing SSL framework for speaker representation learning. Two methods are
-introduced for enhancing the unsupervised speaker information extraction. First, we apply the multi-task learning to
-the current SSL framework, where we integrate the utterance-wise contrastive loss with the SSL objective function.
-Second, for better speaker discrimination, we propose an utterance mixing strategy for data augmentation, where
-additional overlapped utterances are created unsupervisely and incorporate during training. We integrate the proposed
-methods into the HuBERT framework. Experiment results on SUPERB benchmark show that the proposed system achieves
-state-of-the-art performance in universal representation learning, especially for speaker identification oriented
-tasks. An ablation study is performed verifying the efficacy of each proposed method. Finally, we scale up training
-dataset to 94 thousand hours public audio data and achieve further performance improvement in all SUPERB tasks.*
-
-Tips:
-
-- UniSpeechSat is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
-  Please use :class:`~transformers.Wav2Vec2Processor` for the feature extraction.
-- UniSpeechSat model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be
-  decoded using :class:`~transformers.Wav2Vec2CTCTokenizer`.
-- UniSpeechSat performs especially well on speaker verification, speaker identification, and speaker diarization tasks.
-
-This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The Authors' code can be
-found `here <https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT>`__.
-
-
-UniSpeechSatConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.UniSpeechSatConfig
-    :members:
-
-
-UniSpeechSat specific outputs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatBaseModelOutput
-    :members: 
-
-.. autoclass:: transformers.models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatForPreTrainingOutput
-    :members: 
-
-
-UniSpeechSatModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.UniSpeechSatModel
-    :members: forward
-
-
-UniSpeechSatForCTC
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.UniSpeechSatForCTC
-    :members: forward
-
-
-UniSpeechSatForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.UniSpeechSatForSequenceClassification
-    :members: forward
-
-
-UniSpeechSatForAudioFrameClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.UniSpeechSatForAudioFrameClassification
-    :members: forward
-
-
-UniSpeechSatForXVector
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.UniSpeechSatForXVector
-    :members: forward
-
-
-UniSpeechSatForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.UniSpeechSatForPreTraining
-    :members: forward
diff --git a/docs/source/model_doc/vision_text_dual_encoder.mdx b/docs/source/model_doc/vision_text_dual_encoder.mdx
new file mode 100644
index 0000000000..fcc4f38288
--- /dev/null
+++ b/docs/source/model_doc/vision_text_dual_encoder.mdx
@@ -0,0 +1,43 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# VisionTextDualEncoder
+
+## Overview
+
+The [`VisionTextDualEncoderModel`] can be used to initialize a vision-text dual encoder model with
+any pretrained vision autoencoding model as the vision encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit)) and any pretrained text autoencoding model as the text encoder (*e.g.* [RoBERTa](roberta), [BERT](bert)). Two projection layers are added on top of both the vision and text encoder to project the output embeddings
+to a shared latent space. The projection layers are randomly initialized so the model should be fine-tuned on a
+downstream task. This model can be used to align the vision-text embeddings using CLIP like contrastive image-text
+training and then can be used for zero-shot vision tasks such image-classification or retrieval.
+
+In [LiT: Zero-Shot Transfer with Locked-image Text Tuning](https://arxiv.org/abs/2111.07991) it is shown how
+leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvment on
+new zero-shot vision tasks such as image classification or retrieval.
+
+## VisionTextDualEncoderConfig
+
+[[autodoc]] VisionTextDualEncoderConfig
+
+## VisionTextDualEncoderProcessor
+
+[[autodoc]] VisionTextDualEncoderProcessor
+
+## VisionTextDualEncoderModel
+
+[[autodoc]] VisionTextDualEncoderModel
+    - forward
+
+## FlaxVisionTextDualEncoderModel
+
+[[autodoc]] FlaxVisionTextDualEncoderModel
+    - __call__
diff --git a/docs/source/model_doc/vision_text_dual_encoder.rst b/docs/source/model_doc/vision_text_dual_encoder.rst
deleted file mode 100644
index 2544a5388a..0000000000
--- a/docs/source/model_doc/vision_text_dual_encoder.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-VisionTextDualEncoder
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The :class:`~transformers.VisionTextDualEncoderModel` can be used to initialize a vision-text dual encoder model with
-any pretrained vision autoencoding model as the vision encoder (*e.g.* :doc:`ViT <vit>`, :doc:`BEiT <beit>`, :doc:`DeiT
-<deit>`) and any pretrained text autoencoding model as the text encoder (*e.g.* :doc:`RoBERTa <roberta>`, :doc:`BERT
-<bert>`). Two projection layers are added on top of both the vision and text encoder to project the output embeddings
-to a shared latent space. The projection layers are randomly initialized so the model should be fine-tuned on a
-downstream task. This model can be used to align the vision-text embeddings using CLIP like contrastive image-text
-training and then can be used for zero-shot vision tasks such image-classification or retrieval.
-
-In `LiT: Zero-Shot Transfer with Locked-image Text Tuning <https://arxiv.org/abs/2111.07991>`__ it is shown how
-leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvment on
-new zero-shot vision tasks such as image classification or retrieval.
-
-VisionTextDualEncoderConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.VisionTextDualEncoderConfig
-    :members:
-
-
-VisionTextDualEncoderProcessor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.VisionTextDualEncoderProcessor
-    :members:
-
-
-VisionTextDualEncoderModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.VisionTextDualEncoderModel
-    :members: forward
-
-
-FlaxVisionTextDualEncoderModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxVisionTextDualEncoderModel
-    :members: __call__
diff --git a/docs/source/model_doc/visionencoderdecoder.mdx b/docs/source/model_doc/visionencoderdecoder.mdx
new file mode 100644
index 0000000000..128dbaec92
--- /dev/null
+++ b/docs/source/model_doc/visionencoderdecoder.mdx
@@ -0,0 +1,40 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Vision Encoder Decoder Models
+
+The [`VisionEncoderDecoderModel`] can be used to initialize an image-to-text-sequence model with any
+pretrained vision autoencoding model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit))
+and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert)).
+
+The effectiveness of initializing image-to-text-sequence models with pretrained checkpoints has been shown in (for
+example) [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang,
+Zhoujun Li, Furu Wei.
+
+An example of how to use a [`VisionEncoderDecoderModel`] for inference can be seen in [TrOCR](trocr).
+
+
+## VisionEncoderDecoderConfig
+
+[[autodoc]] VisionEncoderDecoderConfig
+
+## VisionEncoderDecoderModel
+
+[[autodoc]] VisionEncoderDecoderModel
+    - forward
+    - from_encoder_decoder_pretrained
+
+## FlaxVisionEncoderDecoderModel
+
+[[autodoc]] FlaxVisionEncoderDecoderModel
+    - __call__
+    - from_encoder_decoder_pretrained
diff --git a/docs/source/model_doc/visionencoderdecoder.rst b/docs/source/model_doc/visionencoderdecoder.rst
deleted file mode 100644
index 08a5fe4718..0000000000
--- a/docs/source/model_doc/visionencoderdecoder.rst
+++ /dev/null
@@ -1,48 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Vision Encoder Decoder Models
------------------------------------------------------------------------------------------------------------------------
-
-The :class:`~transformers.VisionEncoderDecoderModel` can be used to initialize an image-to-text-sequence model with any
-pretrained vision autoencoding model as the encoder (*e.g.* :doc:`ViT <vit>`, :doc:`BEiT <beit>`, :doc:`DeiT <deit>`)
-and any pretrained language model as the decoder (*e.g.* :doc:`RoBERTa <roberta>`, :doc:`GPT2 <gpt2>`, :doc:`BERT
-<bert>`).
-
-The effectiveness of initializing image-to-text-sequence models with pretrained checkpoints has been shown in (for
-example) `TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models
-<https://arxiv.org/abs/2109.10282>`__ by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang,
-Zhoujun Li, Furu Wei.
-
-An example of how to use a :class:`~transformers.VisionEncoderDecoderModel` for inference can be seen in :doc:`TrOCR
-<trocr>`.
-
-
-VisionEncoderDecoderConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.VisionEncoderDecoderConfig
-    :members:
-
-
-VisionEncoderDecoderModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.VisionEncoderDecoderModel
-    :members: forward, from_encoder_decoder_pretrained
-
-
-FlaxVisionEncoderDecoderModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxVisionEncoderDecoderModel
-    :members: __call__, from_encoder_decoder_pretrained
diff --git a/docs/source/model_doc/visual_bert.mdx b/docs/source/model_doc/visual_bert.mdx
new file mode 100644
index 0000000000..f8ec0714a7
--- /dev/null
+++ b/docs/source/model_doc/visual_bert.mdx
@@ -0,0 +1,123 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# VisualBERT
+
+## Overview
+
+The VisualBERT model was proposed in [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
+VisualBERT is a neural network trained on a variety of (image, text) pairs.
+
+The abstract from the paper is the following:
+
+*We propose VisualBERT, a simple and flexible framework for modeling a broad range of vision-and-language tasks.
+VisualBERT consists of a stack of Transformer layers that implicitly align elements of an input text and regions in an
+associated input image with self-attention. We further propose two visually-grounded language model objectives for
+pre-training VisualBERT on image caption data. Experiments on four vision-and-language tasks including VQA, VCR, NLVR2,
+and Flickr30K show that VisualBERT outperforms or rivals with state-of-the-art models while being significantly
+simpler. Further analysis demonstrates that VisualBERT can ground elements of language to image regions without any
+explicit supervision and is even sensitive to syntactic relationships, tracking, for example, associations between
+verbs and image regions corresponding to their arguments.*
+
+Tips:
+
+1. Most of the checkpoints provided work with the [`VisualBertForPreTraining`] configuration. Other
+   checkpoints provided are the fine-tuned checkpoints for down-stream tasks - VQA ('visualbert-vqa'), VCR
+   ('visualbert-vcr'), NLVR2 ('visualbert-nlvr2'). Hence, if you are not working on these downstream tasks, it is
+   recommended that you use the pretrained checkpoints.
+
+2. For the VCR task, the authors use a fine-tuned detector for generating visual embeddings, for all the checkpoints.
+   We do not provide the detector and its weights as a part of the package, but it will be available in the research
+   projects, and the states can be loaded directly into the detector provided.
+
+## Usage
+
+VisualBERT is a multi-modal vision and language model. It can be used for visual question answering, multiple choice,
+visual reasoning and region-to-phrase correspondence tasks. VisualBERT uses a BERT-like transformer to prepare
+embeddings for image-text pairs. Both the text and visual features are then projected to a latent space with identical
+dimension.
+
+To feed images to the model, each image is passed through a pre-trained object detector and the regions and the
+bounding boxes are extracted. The authors use the features generated after passing these regions through a pre-trained
+CNN like ResNet as visual embeddings. They also add absolute position embeddings, and feed the resulting sequence of
+vectors to a standard BERT model. The text input is concatenated in the front of the visual embeddings in the embedding
+layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The segment IDs must also be set
+appropriately for the textual and visual parts.
+
+The [`BertTokenizer`] is used to encode the text. A custom detector/feature extractor must be used
+to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models:
+
+- [VisualBERT VQA demo notebook](https://github.com/huggingface/transformers/tree/master/examples/research_projects/visual_bert) : This notebook
+  contains an example on VisualBERT VQA.
+
+- [Generate Embeddings for VisualBERT (Colab Notebook)](https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing) : This notebook contains
+  an example on how to generate visual embeddings.
+
+The following example shows how to get the last hidden state using [`VisualBertModel`]:
+
+```python
+>>> import torch
+>>> from transformers import BertTokenizer, VisualBertModel
+
+>>> model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
+>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("What is the man eating?", return_tensors="pt")
+>>> # this is a custom function that returns the visual embeddings given the image path
+>>> visual_embeds = get_visual_embeddings(image_path)
+
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+>>> inputs.update({
+...     "visual_embeds": visual_embeds,
+...     "visual_token_type_ids": visual_token_type_ids,
+...     "visual_attention_mask": visual_attention_mask
+... })
+>>> outputs = model(**inputs)
+>>> last_hidden_state = outputs.last_hidden_state
+```
+
+This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The original code can be found [here](https://github.com/uclanlp/visualbert).
+
+## VisualBertConfig
+
+[[autodoc]] VisualBertConfig
+
+## VisualBertModel
+
+[[autodoc]] VisualBertModel
+    - forward
+
+## VisualBertForPreTraining
+
+[[autodoc]] VisualBertForPreTraining
+    - forward
+
+## VisualBertForQuestionAnswering
+
+[[autodoc]] VisualBertForQuestionAnswering
+    - forward
+
+## VisualBertForMultipleChoice
+
+[[autodoc]] VisualBertForMultipleChoice
+    - forward
+
+## VisualBertForVisualReasoning
+
+[[autodoc]] VisualBertForVisualReasoning
+    - forward
+
+## VisualBertForRegionToPhraseAlignment
+
+[[autodoc]] VisualBertForRegionToPhraseAlignment
+    - forward
diff --git a/docs/source/model_doc/visual_bert.rst b/docs/source/model_doc/visual_bert.rst
deleted file mode 100644
index 7258b12ad8..0000000000
--- a/docs/source/model_doc/visual_bert.rst
+++ /dev/null
@@ -1,143 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-VisualBERT
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The VisualBERT model was proposed in `VisualBERT: A Simple and Performant Baseline for Vision and Language
-<https://arxiv.org/pdf/1908.03557>`__ by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-VisualBERT is a neural network trained on a variety of (image, text) pairs.
-
-The abstract from the paper is the following:
-
-*We propose VisualBERT, a simple and flexible framework for modeling a broad range of vision-and-language tasks.
-VisualBERT consists of a stack of Transformer layers that implicitly align elements of an input text and regions in an
-associated input image with self-attention. We further propose two visually-grounded language model objectives for
-pre-training VisualBERT on image caption data. Experiments on four vision-and-language tasks including VQA, VCR, NLVR2,
-and Flickr30K show that VisualBERT outperforms or rivals with state-of-the-art models while being significantly
-simpler. Further analysis demonstrates that VisualBERT can ground elements of language to image regions without any
-explicit supervision and is even sensitive to syntactic relationships, tracking, for example, associations between
-verbs and image regions corresponding to their arguments.*
-
-Tips:
-
-1. Most of the checkpoints provided work with the :class:`~transformers.VisualBertForPreTraining` configuration. Other
-   checkpoints provided are the fine-tuned checkpoints for down-stream tasks - VQA ('visualbert-vqa'), VCR
-   ('visualbert-vcr'), NLVR2 ('visualbert-nlvr2'). Hence, if you are not working on these downstream tasks, it is
-   recommended that you use the pretrained checkpoints.
-
-2. For the VCR task, the authors use a fine-tuned detector for generating visual embeddings, for all the checkpoints.
-   We do not provide the detector and its weights as a part of the package, but it will be available in the research
-   projects, and the states can be loaded directly into the detector provided.
-
-Usage
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-VisualBERT is a multi-modal vision and language model. It can be used for visual question answering, multiple choice,
-visual reasoning and region-to-phrase correspondence tasks. VisualBERT uses a BERT-like transformer to prepare
-embeddings for image-text pairs. Both the text and visual features are then projected to a latent space with identical
-dimension.
-
-To feed images to the model, each image is passed through a pre-trained object detector and the regions and the
-bounding boxes are extracted. The authors use the features generated after passing these regions through a pre-trained
-CNN like ResNet as visual embeddings. They also add absolute position embeddings, and feed the resulting sequence of
-vectors to a standard BERT model. The text input is concatenated in the front of the visual embeddings in the embedding
-layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The segment IDs must also be set
-appropriately for the textual and visual parts.
-
-The :class:`~transformers.BertTokenizer` is used to encode the text. A custom detector/feature extractor must be used
-to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models:
-
-* `VisualBERT VQA demo notebook
-  <https://github.com/huggingface/transformers/tree/master/examples/research_projects/visual_bert>`__ : This notebook
-  contains an example on VisualBERT VQA.
-
-* `Generate Embeddings for VisualBERT (Colab Notebook)
-  <https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__ : This notebook contains
-  an example on how to generate visual embeddings.
-
-The following example shows how to get the last hidden state using :class:`~transformers.VisualBertModel`:
-
-.. code-block::
-
-        >>> import torch
-        >>> from transformers import BertTokenizer, VisualBertModel
-
-        >>> model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
-        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-
-        >>> inputs = tokenizer("What is the man eating?", return_tensors="pt")
-        >>> # this is a custom function that returns the visual embeddings given the image path
-        >>> visual_embeds = get_visual_embeddings(image_path)
-
-        >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
-        >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
-        >>> inputs.update({
-        ...     "visual_embeds": visual_embeds,
-        ...     "visual_token_type_ids": visual_token_type_ids,
-        ...     "visual_attention_mask": visual_attention_mask
-        ... })
-        >>> outputs = model(**inputs)
-        >>> last_hidden_state = outputs.last_hidden_state
-
-This model was contributed by `gchhablani <https://huggingface.co/gchhablani>`__. The original code can be found `here
-<https://github.com/uclanlp/visualbert>`__.
-
-VisualBertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.VisualBertConfig
-    :members:
-
-VisualBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.VisualBertModel
-    :members: forward
-
-
-VisualBertForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.VisualBertForPreTraining
-    :members: forward
-
-
-VisualBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.VisualBertForQuestionAnswering
-    :members: forward
-
-
-VisualBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.VisualBertForMultipleChoice
-    :members: forward
-
-
-VisualBertForVisualReasoning
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.VisualBertForVisualReasoning
-    :members: forward
-
-
-VisualBertForRegionToPhraseAlignment
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.VisualBertForRegionToPhraseAlignment
-    :members: forward
diff --git a/docs/source/model_doc/vit.mdx b/docs/source/model_doc/vit.mdx
new file mode 100644
index 0000000000..42afd0c87f
--- /dev/null
+++ b/docs/source/model_doc/vit.mdx
@@ -0,0 +1,127 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Vision Transformer (ViT)
+
+<Tip>
+
+This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight
+breaking changes to fix it in the future. If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title).
+
+</Tip>
+
+## Overview
+
+The Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition
+at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk
+Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob
+Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining
+very good results compared to familiar convolutional architectures.
+
+
+The abstract from the paper is the following:
+
+*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its
+applications to computer vision remain limited. In vision, attention is either applied in conjunction with
+convolutional networks, or used to replace certain components of convolutional networks while keeping their overall
+structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to
+sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of
+data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.),
+Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring
+substantially fewer computational resources to train.*
+
+Tips:
+
+- Demo notebooks regarding inference as well as fine-tuning ViT on custom data can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer).
+- To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches,
+  which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be
+  used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of
+  vectors to a standard Transformer encoder.
+- As the Vision Transformer expects each image to be of the same size (resolution), one can use
+  [`ViTFeatureExtractor`] to resize (or rescale) and normalize images for the model.
+- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of
+  each checkpoint. For example, `google/vit-base-patch16-224` refers to a base-sized architecture with patch
+  resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the [hub](https://huggingface.co/models?search=vit).
+- The available checkpoints are either (1) pre-trained on [ImageNet-21k](http://www.image-net.org/) (a collection of
+  14 million images and 21k classes) only, or (2) also fine-tuned on [ImageNet](http://www.image-net.org/challenges/LSVRC/2012/) (also referred to as ILSVRC 2012, a collection of 1.3 million
+  images and 1,000 classes).
+- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to
+  use a higher resolution than pre-training [(Touvron et al., 2019)](https://arxiv.org/abs/1906.06423), [(Kolesnikov
+  et al., 2020)](https://arxiv.org/abs/1912.11370). In order to fine-tune at higher resolution, the authors perform
+  2D interpolation of the pre-trained position embeddings, according to their location in the original image.
+- The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed
+  an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked
+  language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant
+  improvement of 2% to training from scratch, but still 4% behind supervised pre-training.
+
+Following the original Vision Transformer, some follow-up works have been made:
+
+- DeiT (Data-efficient Image Transformers) by Facebook AI. DeiT models are distilled vision transformers. Refer to
+  [DeiT's documentation page](deit). The authors of DeiT also released more efficiently trained ViT models, which
+  you can directly plug into [`ViTModel`] or [`ViTForImageClassification`]. There
+  are 4 variants available (in 3 different sizes): *facebook/deit-tiny-patch16-224*, *facebook/deit-small-patch16-224*,
+  *facebook/deit-base-patch16-224* and *facebook/deit-base-patch16-384*. Note that one should use
+  [`DeiTFeatureExtractor`] in order to prepare images for the model.
+
+- BEiT (BERT pre-training of Image Transformers) by Microsoft Research. BEiT models outperform supervised pre-trained
+  vision transformers using a self-supervised method inspired by BERT (masked image modeling) and based on a VQ-VAE.
+  Refer to [BEiT's documentation page](beit).
+
+- DINO (a method for self-supervised training of Vision Transformers) by Facebook AI. Vision Transformers trained using
+  the DINO method show very interesting properties not seen with convolutional models. They are capable of segmenting
+  objects, without having ever been trained to do so. DINO checkpoints can be found on the [hub](https://huggingface.co/models?other=dino).
+
+This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be
+found [here](https://github.com/google-research/vision_transformer).
+
+Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models), who already converted the weights from JAX to PyTorch. Credits
+go to him!
+
+
+## ViTConfig
+
+[[autodoc]] ViTConfig
+
+## ViTFeatureExtractor
+
+[[autodoc]] ViTFeatureExtractor
+    - __call__
+
+## ViTModel
+
+[[autodoc]] ViTModel
+    - forward
+
+## ViTForImageClassification
+
+[[autodoc]] ViTForImageClassification
+    - forward
+
+## TFViTModel
+
+[[autodoc]] TFViTModel
+    - call
+
+## TFViTForImageClassification
+
+[[autodoc]] TFViTForImageClassification
+    - call
+
+## FlaxVitModel
+
+[[autodoc]] FlaxViTModel
+    - __call__
+
+## FlaxViTForImageClassification
+
+[[autodoc]] FlaxViTForImageClassification
+    - __call__
diff --git a/docs/source/model_doc/vit.rst b/docs/source/model_doc/vit.rst
deleted file mode 100644
index 4a465517a9..0000000000
--- a/docs/source/model_doc/vit.rst
+++ /dev/null
@@ -1,151 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Vision Transformer (ViT)
------------------------------------------------------------------------------------------------------------------------
-
-.. note::
-
-    This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight
-    breaking changes to fix it in the future. If you see something strange, file a `Github Issue
-    <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
-
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Vision Transformer (ViT) model was proposed in `An Image is Worth 16x16 Words: Transformers for Image Recognition
-at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk
-Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob
-Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining
-very good results compared to familiar convolutional architectures.
-
-
-The abstract from the paper is the following:
-
-*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its
-applications to computer vision remain limited. In vision, attention is either applied in conjunction with
-convolutional networks, or used to replace certain components of convolutional networks while keeping their overall
-structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to
-sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of
-data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.),
-Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring
-substantially fewer computational resources to train.*
-
-Tips:
-
-- Demo notebooks regarding inference as well as fine-tuning ViT on custom data can be found `here
-  <https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer>`__.
-- To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches,
-  which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be
-  used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of
-  vectors to a standard Transformer encoder.
-- As the Vision Transformer expects each image to be of the same size (resolution), one can use
-  :class:`~transformers.ViTFeatureExtractor` to resize (or rescale) and normalize images for the model.
-- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of
-  each checkpoint. For example, :obj:`google/vit-base-patch16-224` refers to a base-sized architecture with patch
-  resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the `hub
-  <https://huggingface.co/models?search=vit>`__.
-- The available checkpoints are either (1) pre-trained on `ImageNet-21k <http://www.image-net.org/>`__ (a collection of
-  14 million images and 21k classes) only, or (2) also fine-tuned on `ImageNet
-  <http://www.image-net.org/challenges/LSVRC/2012/>`__ (also referred to as ILSVRC 2012, a collection of 1.3 million
-  images and 1,000 classes).
-- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to
-  use a higher resolution than pre-training `(Touvron et al., 2019) <https://arxiv.org/abs/1906.06423>`__, `(Kolesnikov
-  et al., 2020) <https://arxiv.org/abs/1912.11370>`__. In order to fine-tune at higher resolution, the authors perform
-  2D interpolation of the pre-trained position embeddings, according to their location in the original image.
-- The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed
-  an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked
-  language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant
-  improvement of 2% to training from scratch, but still 4% behind supervised pre-training.
-
-Following the original Vision Transformer, some follow-up works have been made:
-
-- DeiT (Data-efficient Image Transformers) by Facebook AI. DeiT models are distilled vision transformers. Refer to
-  :doc:`DeiT's documentation page <deit>`. The authors of DeiT also released more efficiently trained ViT models, which
-  you can directly plug into :class:`~transformers.ViTModel` or :class:`~transformers.ViTForImageClassification`. There
-  are 4 variants available (in 3 different sizes): `facebook/deit-tiny-patch16-224`, `facebook/deit-small-patch16-224`,
-  `facebook/deit-base-patch16-224` and `facebook/deit-base-patch16-384`. Note that one should use
-  :class:`~transformers.DeiTFeatureExtractor` in order to prepare images for the model.
-
-- BEiT (BERT pre-training of Image Transformers) by Microsoft Research. BEiT models outperform supervised pre-trained
-  vision transformers using a self-supervised method inspired by BERT (masked image modeling) and based on a VQ-VAE.
-  Refer to :doc:`BEiT's documentation page <beit>`.
-
-- DINO (a method for self-supervised training of Vision Transformers) by Facebook AI. Vision Transformers trained using
-  the DINO method show very interesting properties not seen with convolutional models. They are capable of segmenting
-  objects, without having ever been trained to do so. DINO checkpoints can be found on the `hub
-  <https://huggingface.co/models?other=dino>`__.
-
-This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code (written in JAX) can be
-found `here <https://github.com/google-research/vision_transformer>`__.
-
-Note that we converted the weights from Ross Wightman's `timm library
-<https://github.com/rwightman/pytorch-image-models>`__, who already converted the weights from JAX to PyTorch. Credits
-go to him!
-
-
-ViTConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ViTConfig
-    :members:
-
-
-ViTFeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ViTFeatureExtractor
-    :members: __call__
-
-
-ViTModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ViTModel
-    :members: forward
-
-
-ViTForImageClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ViTForImageClassification
-    :members: forward
-
-
-TFViTModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFViTModel
-    :members: call
-
-
-TFViTForImageClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFViTForImageClassification
-    :members: call
-
-
-FlaxVitModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxViTModel
-    :members: __call__
-
-
-FlaxViTForImageClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxViTForImageClassification
-    :members: __call__
-
diff --git a/docs/source/model_doc/wav2vec2.mdx b/docs/source/model_doc/wav2vec2.mdx
new file mode 100644
index 0000000000..11d3444cc7
--- /dev/null
+++ b/docs/source/model_doc/wav2vec2.mdx
@@ -0,0 +1,141 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Wav2Vec2
+
+## Overview
+
+The Wav2Vec2 model was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
+
+The abstract from the paper is the following:
+
+*We show for the first time that learning powerful representations from speech audio alone followed by fine-tuning on
+transcribed speech can outperform the best semi-supervised methods while being conceptually simpler. wav2vec 2.0 masks
+the speech input in the latent space and solves a contrastive task defined over a quantization of the latent
+representations which are jointly learned. Experiments using all labeled data of Librispeech achieve 1.8/3.3 WER on the
+clean/other test sets. When lowering the amount of labeled data to one hour, wav2vec 2.0 outperforms the previous state
+of the art on the 100 hour subset while using 100 times less labeled data. Using just ten minutes of labeled data and
+pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech
+recognition with limited amounts of labeled data.*
+
+Tips:
+
+- Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
+- Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded
+  using [`Wav2Vec2CTCTokenizer`].
+
+This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten).
+
+
+## Wav2Vec2Config
+
+[[autodoc]] Wav2Vec2Config
+
+## Wav2Vec2CTCTokenizer
+
+[[autodoc]] Wav2Vec2CTCTokenizer
+    - __call__
+    - save_vocabulary
+
+## Wav2Vec2FeatureExtractor
+
+[[autodoc]] Wav2Vec2FeatureExtractor
+    - __call__
+
+## Wav2Vec2Processor
+
+[[autodoc]] Wav2Vec2Processor
+    - __call__
+    - pad
+    - from_pretrained
+    - save_pretrained
+    - batch_decode
+    - decode
+    - as_target_processor
+
+## Wav2Vec2ProcessorWithLM
+
+[[autodoc]] Wav2Vec2ProcessorWithLM
+    - __call__
+    - pad
+    - from_pretrained
+    - save_pretrained
+    - batch_decode
+    - decode
+    - as_target_processor
+
+## Wav2Vec2 specific outputs
+
+[[autodoc]] models.wav2vec2_with_lm.processing_wav2vec2_with_lm.Wav2Vec2DecoderWithLMOutput
+
+[[autodoc]] models.wav2vec2.modeling_wav2vec2.Wav2Vec2BaseModelOutput
+
+[[autodoc]] models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput
+
+[[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2BaseModelOutput
+
+[[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2ForPreTrainingOutput
+
+## Wav2Vec2Model
+
+[[autodoc]] Wav2Vec2Model
+    - forward
+
+## Wav2Vec2ForCTC
+
+[[autodoc]] Wav2Vec2ForCTC
+    - forward
+
+## Wav2Vec2ForSequenceClassification
+
+[[autodoc]] Wav2Vec2ForSequenceClassification
+    - forward
+
+## Wav2Vec2ForAudioFrameClassification
+
+[[autodoc]] Wav2Vec2ForAudioFrameClassification
+    - forward
+
+## Wav2Vec2ForXVector
+
+[[autodoc]] Wav2Vec2ForXVector
+    - forward
+
+## Wav2Vec2ForPreTraining
+
+[[autodoc]] Wav2Vec2ForPreTraining
+    - forward
+
+## TFWav2Vec2Model
+
+[[autodoc]] TFWav2Vec2Model
+    - call
+
+## TFWav2Vec2ForCTC
+
+[[autodoc]] TFWav2Vec2ForCTC
+    - call
+
+## FlaxWav2Vec2Model
+
+[[autodoc]] FlaxWav2Vec2Model
+    - __call__
+
+## FlaxWav2Vec2ForCTC
+
+[[autodoc]] FlaxWav2Vec2ForCTC
+    - __call__
+
+## FlaxWav2Vec2ForPreTraining
+
+[[autodoc]] FlaxWav2Vec2ForPreTraining
+    - __call__
diff --git a/docs/source/model_doc/wav2vec2.rst b/docs/source/model_doc/wav2vec2.rst
deleted file mode 100644
index b988f11408..0000000000
--- a/docs/source/model_doc/wav2vec2.rst
+++ /dev/null
@@ -1,169 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Wav2Vec2
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Wav2Vec2 model was proposed in `wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations
-<https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
-
-The abstract from the paper is the following:
-
-*We show for the first time that learning powerful representations from speech audio alone followed by fine-tuning on
-transcribed speech can outperform the best semi-supervised methods while being conceptually simpler. wav2vec 2.0 masks
-the speech input in the latent space and solves a contrastive task defined over a quantization of the latent
-representations which are jointly learned. Experiments using all labeled data of Librispeech achieve 1.8/3.3 WER on the
-clean/other test sets. When lowering the amount of labeled data to one hour, wav2vec 2.0 outperforms the previous state
-of the art on the 100 hour subset while using 100 times less labeled data. Using just ten minutes of labeled data and
-pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech
-recognition with limited amounts of labeled data.*
-
-Tips:
-
-- Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
-- Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded
-  using :class:`~transformers.Wav2Vec2CTCTokenizer`.
-
-This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__.
-
-
-Wav2Vec2Config
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2Config
-    :members:
-
-
-Wav2Vec2CTCTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2CTCTokenizer
-    :members: __call__, save_vocabulary
-
-
-Wav2Vec2FeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2FeatureExtractor
-    :members: __call__
-
-
-Wav2Vec2Processor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2Processor
-    :members: __call__, pad, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
-
-
-Wav2Vec2ProcessorWithLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2ProcessorWithLM
-    :members: __call__, pad, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
-
-
-Wav2Vec2 specific outputs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.models.wav2vec2_with_lm.processing_wav2vec2_with_lm.Wav2Vec2DecoderWithLMOutput
-    :members: 
-
-.. autoclass:: transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2BaseModelOutput
-    :members: 
-
-.. autoclass:: transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput
-    :members: 
-
-.. autoclass:: transformers.models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2BaseModelOutput
-    :members: 
-
-.. autoclass:: transformers.models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2ForPreTrainingOutput
-    :members: 
-
-
-Wav2Vec2Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2Model
-    :members: forward
-
-
-Wav2Vec2ForCTC
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2ForCTC
-    :members: forward
-
-
-Wav2Vec2ForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2ForSequenceClassification
-    :members: forward
-
-
-Wav2Vec2ForAudioFrameClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2ForAudioFrameClassification
-    :members: forward
-
-
-Wav2Vec2ForXVector
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2ForXVector
-    :members: forward
-
-
-Wav2Vec2ForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2ForPreTraining
-    :members: forward
-
-
-TFWav2Vec2Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFWav2Vec2Model
-    :members: call
-
-
-TFWav2Vec2ForCTC
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFWav2Vec2ForCTC
-    :members: call
-
-
-FlaxWav2Vec2Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxWav2Vec2Model
-    :members: __call__
-
-
-FlaxWav2Vec2ForCTC
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxWav2Vec2ForCTC
-    :members: __call__
-
-FlaxWav2Vec2ForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxWav2Vec2ForPreTraining
-    :members: __call__
diff --git a/docs/source/model_doc/wavlm.mdx b/docs/source/model_doc/wavlm.mdx
new file mode 100644
index 0000000000..254321cd7f
--- /dev/null
+++ b/docs/source/model_doc/wavlm.mdx
@@ -0,0 +1,79 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# WavLM
+
+## Overview
+
+The WavLM model was proposed in [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen,
+Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu,
+Michael Zeng, Furu Wei.
+
+The abstract from the paper is the following:
+
+*Self-supervised learning (SSL) achieves great success in speech recognition, while limited exploration has been
+attempted for other speech processing tasks. As speech signal contains multi-faceted information including speaker
+identity, paralinguistics, spoken content, etc., learning universal representations for all speech tasks is
+challenging. In this paper, we propose a new pre-trained model, WavLM, to solve full-stack downstream speech tasks.
+WavLM is built based on the HuBERT framework, with an emphasis on both spoken content modeling and speaker identity
+preservation. We first equip the Transformer structure with gated relative position bias to improve its capability on
+recognition tasks. For better speaker discrimination, we propose an utterance mixing training strategy, where
+additional overlapped utterances are created unsupervisely and incorporated during model training. Lastly, we scale up
+the training dataset from 60k hours to 94k hours. WavLM Large achieves state-of-the-art performance on the SUPERB
+benchmark, and brings significant improvements for various speech processing tasks on their representative benchmarks.*
+
+Tips:
+
+- WavLM is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please use
+  [`Wav2Vec2Processor`] for the feature extraction.
+- WavLM model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded
+  using [`Wav2Vec2CTCTokenizer`].
+- WavLM performs especially well on speaker verification, speaker identification, and speaker diarization tasks.
+
+Relevant checkpoints can be found under https://huggingface.co/models?other=wavlm.
+
+This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be
+found [here](https://github.com/microsoft/unilm/tree/master/wavlm).
+
+
+## WavLMConfig
+
+[[autodoc]] WavLMConfig
+
+## WavLM specific outputs
+
+[[autodoc]] models.wavlm.modeling_wavlm.WavLMBaseModelOutput
+
+## WavLMModel
+
+[[autodoc]] WavLMModel
+    - forward
+
+## WavLMForCTC
+
+[[autodoc]] WavLMForCTC
+    - forward
+
+## WavLMForSequenceClassification
+
+[[autodoc]] WavLMForSequenceClassification
+    - forward
+
+## WavLMForAudioFrameClassification
+
+[[autodoc]] WavLMForAudioFrameClassification
+    - forward
+
+## WavLMForXVector
+
+[[autodoc]] WavLMForXVector
+    - forward
diff --git a/docs/source/model_doc/wavlm.rst b/docs/source/model_doc/wavlm.rst
deleted file mode 100644
index 59d030e83b..0000000000
--- a/docs/source/model_doc/wavlm.rst
+++ /dev/null
@@ -1,97 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-WavLM
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The WavLM model was proposed in `WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing
-<https://arxiv.org/abs/2110.13900>`__ by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen,
-Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu,
-Michael Zeng, Furu Wei.
-
-The abstract from the paper is the following:
-
-*Self-supervised learning (SSL) achieves great success in speech recognition, while limited exploration has been
-attempted for other speech processing tasks. As speech signal contains multi-faceted information including speaker
-identity, paralinguistics, spoken content, etc., learning universal representations for all speech tasks is
-challenging. In this paper, we propose a new pre-trained model, WavLM, to solve full-stack downstream speech tasks.
-WavLM is built based on the HuBERT framework, with an emphasis on both spoken content modeling and speaker identity
-preservation. We first equip the Transformer structure with gated relative position bias to improve its capability on
-recognition tasks. For better speaker discrimination, we propose an utterance mixing training strategy, where
-additional overlapped utterances are created unsupervisely and incorporated during model training. Lastly, we scale up
-the training dataset from 60k hours to 94k hours. WavLM Large achieves state-of-the-art performance on the SUPERB
-benchmark, and brings significant improvements for various speech processing tasks on their representative benchmarks.*
-
-Tips:
-
-- WavLM is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please use
-  :class:`~transformers.Wav2Vec2Processor` for the feature extraction.
-- WavLM model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded
-  using :class:`~transformers.Wav2Vec2CTCTokenizer`.
-- WavLM performs especially well on speaker verification, speaker identification, and speaker diarization tasks.
-
-Relevant checkpoints can be found under https://huggingface.co/models?other=wavlm.
-
-This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The Authors' code can be
-found `here <https://github.com/microsoft/unilm/tree/master/wavlm>`__.
-
-
-WavLMConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.WavLMConfig
-    :members:
-
-
-WavLM specific outputs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.models.wavlm.modeling_wavlm.WavLMBaseModelOutput
-    :members: 
-
-
-WavLMModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.WavLMModel
-    :members: forward
-
-
-WavLMForCTC
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.WavLMForCTC
-    :members: forward
-
-
-WavLMForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.WavLMForSequenceClassification
-    :members: forward
-
-
-WavLMForAudioFrameClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.WavLMForAudioFrameClassification
-    :members: forward
-
-
-WavLMForXVector
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.WavLMForXVector
-    :members: forward
diff --git a/docs/source/model_doc/xlm.mdx b/docs/source/model_doc/xlm.mdx
new file mode 100644
index 0000000000..a441c64c86
--- /dev/null
+++ b/docs/source/model_doc/xlm.mdx
@@ -0,0 +1,124 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# XLM
+
+## Overview
+
+The XLM model was proposed in [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by
+Guillaume Lample, Alexis Conneau. It's a transformer pretrained using one of the following objectives:
+
+- a causal language modeling (CLM) objective (next token prediction),
+- a masked language modeling (MLM) objective (BERT-like), or
+- a Translation Language Modeling (TLM) object (extension of BERT's MLM to multiple language inputs)
+
+The abstract from the paper is the following:
+
+*Recent studies have demonstrated the efficiency of generative pretraining for English natural language understanding.
+In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining. We
+propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual
+data, and one supervised that leverages parallel data with a new cross-lingual language model objective. We obtain
+state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI, our
+approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation, we
+obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On supervised
+machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming the
+previous best approach by more than 4 BLEU. Our code and pretrained models will be made publicly available.*
+
+Tips:
+
+- XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to
+  select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation).
+- XLM has multilingual checkpoints which leverage a specific `lang` parameter. Check out the [multi-lingual](../multilingual) page for more information.
+
+This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/facebookresearch/XLM/).
+
+
+## XLMConfig
+
+[[autodoc]] XLMConfig
+
+## XLMTokenizer
+
+[[autodoc]] XLMTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## XLM specific outputs
+
+[[autodoc]] models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput
+
+## XLMModel
+
+[[autodoc]] XLMModel
+    - forward
+
+## XLMWithLMHeadModel
+
+[[autodoc]] XLMWithLMHeadModel
+    - forward
+
+## XLMForSequenceClassification
+
+[[autodoc]] XLMForSequenceClassification
+    - forward
+
+## XLMForMultipleChoice
+
+[[autodoc]] XLMForMultipleChoice
+    - forward
+
+## XLMForTokenClassification
+
+[[autodoc]] XLMForTokenClassification
+    - forward
+
+## XLMForQuestionAnsweringSimple
+
+[[autodoc]] XLMForQuestionAnsweringSimple
+    - forward
+
+## XLMForQuestionAnswering
+
+[[autodoc]] XLMForQuestionAnswering
+    - forward
+
+## TFXLMModel
+
+[[autodoc]] TFXLMModel
+    - call
+
+## TFXLMWithLMHeadModel
+
+[[autodoc]] TFXLMWithLMHeadModel
+    - call
+
+## TFXLMForSequenceClassification
+
+[[autodoc]] TFXLMForSequenceClassification
+    - call
+
+## TFXLMForMultipleChoice
+
+[[autodoc]] TFXLMForMultipleChoice
+    - call
+
+## TFXLMForTokenClassification
+
+[[autodoc]] TFXLMForTokenClassification
+    - call
+
+## TFXLMForQuestionAnsweringSimple
+
+[[autodoc]] TFXLMForQuestionAnsweringSimple
+    - call
diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst
deleted file mode 100644
index 5a837714c5..0000000000
--- a/docs/source/model_doc/xlm.rst
+++ /dev/null
@@ -1,159 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-XLM
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The XLM model was proposed in `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`__ by
-Guillaume Lample, Alexis Conneau. It's a transformer pretrained using one of the following objectives:
-
-- a causal language modeling (CLM) objective (next token prediction),
-- a masked language modeling (MLM) objective (BERT-like), or
-- a Translation Language Modeling (TLM) object (extension of BERT's MLM to multiple language inputs)
-
-The abstract from the paper is the following:
-
-*Recent studies have demonstrated the efficiency of generative pretraining for English natural language understanding.
-In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining. We
-propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual
-data, and one supervised that leverages parallel data with a new cross-lingual language model objective. We obtain
-state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI, our
-approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation, we
-obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On supervised
-machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming the
-previous best approach by more than 4 BLEU. Our code and pretrained models will be made publicly available.*
-
-Tips:
-
-- XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to
-  select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation).
-- XLM has multilingual checkpoints which leverage a specific :obj:`lang` parameter. Check out the :doc:`multi-lingual
-  <../multilingual>` page for more information.
-
-This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
-<https://github.com/facebookresearch/XLM/>`__.
-
-
-XLMConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMConfig
-    :members:
-
-XLMTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-XLM specific outputs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput
-    :members:
-
-
-XLMModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMModel
-    :members: forward
-
-
-XLMWithLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMWithLMHeadModel
-    :members: forward
-
-
-XLMForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMForSequenceClassification
-    :members: forward
-
-
-XLMForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMForMultipleChoice
-    :members: forward
-
-
-XLMForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMForTokenClassification
-    :members: forward
-
-
-XLMForQuestionAnsweringSimple
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMForQuestionAnsweringSimple
-    :members: forward
-
-
-XLMForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMForQuestionAnswering
-    :members: forward
-
-
-TFXLMModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMModel
-    :members: call
-
-
-TFXLMWithLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMWithLMHeadModel
-    :members: call
-
-
-TFXLMForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMForSequenceClassification
-    :members: call
-
-
-TFXLMForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMForMultipleChoice
-    :members: call
-
-
-TFXLMForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMForTokenClassification
-    :members: call
-
-
-
-TFXLMForQuestionAnsweringSimple
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMForQuestionAnsweringSimple
-    :members: call
diff --git a/docs/source/model_doc/xlmprophetnet.mdx b/docs/source/model_doc/xlmprophetnet.mdx
new file mode 100644
index 0000000000..af4a3bb6e8
--- /dev/null
+++ b/docs/source/model_doc/xlmprophetnet.mdx
@@ -0,0 +1,68 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# XLM-ProphetNet
+
+**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign
+@patrickvonplaten
+
+
+## Overview
+
+The XLM-ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei
+Zhang, Ming Zhou on 13 Jan, 2020.
+
+XLM-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of
+just the next token. Its architecture is identical to ProhpetNet, but the model was trained on the multi-lingual
+"wiki100" Wikipedia dump.
+
+The abstract from the paper is the following:
+
+*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
+self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
+the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
+n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
+step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent
+overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
+dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
+abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
+state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
+
+The Authors' code can be found [here](https://github.com/microsoft/ProphetNet).
+
+## XLMProphetNetConfig
+
+[[autodoc]] XLMProphetNetConfig
+
+## XLMProphetNetTokenizer
+
+[[autodoc]] XLMProphetNetTokenizer
+
+## XLMProphetNetModel
+
+[[autodoc]] XLMProphetNetModel
+
+## XLMProphetNetEncoder
+
+[[autodoc]] XLMProphetNetEncoder
+
+## XLMProphetNetDecoder
+
+[[autodoc]] XLMProphetNetDecoder
+
+## XLMProphetNetForConditionalGeneration
+
+[[autodoc]] XLMProphetNetForConditionalGeneration
+
+## XLMProphetNetForCausalLM
+
+[[autodoc]] XLMProphetNetForCausalLM
diff --git a/docs/source/model_doc/xlmprophetnet.rst b/docs/source/model_doc/xlmprophetnet.rst
deleted file mode 100644
index bfe0467973..0000000000
--- a/docs/source/model_doc/xlmprophetnet.rst
+++ /dev/null
@@ -1,87 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-XLM-ProphetNet
------------------------------------------------------------------------------------------------------------------------
-
-**DISCLAIMER:** If you see something strange, file a `Github Issue
-<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
-@patrickvonplaten
-
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The XLM-ProphetNet model was proposed in `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,
-<https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei
-Zhang, Ming Zhou on 13 Jan, 2020.
-
-XLM-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of
-just the next token. Its architecture is identical to ProhpetNet, but the model was trained on the multi-lingual
-"wiki100" Wikipedia dump.
-
-The abstract from the paper is the following:
-
-*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
-self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
-the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
-n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
-step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent
-overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
-dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
-abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
-state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
-
-The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.
-
-XLMProphetNetConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMProphetNetConfig
-    :members:
-
-
-XLMProphetNetTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMProphetNetTokenizer
-    :members:
-
-
-XLMProphetNetModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMProphetNetModel
-
-
-XLMProphetNetEncoder
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMProphetNetEncoder
-
-
-XLMProphetNetDecoder
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMProphetNetDecoder
-
-
-XLMProphetNetForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMProphetNetForConditionalGeneration
-
-
-XLMProphetNetForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMProphetNetForCausalLM
diff --git a/docs/source/model_doc/xlmroberta.mdx b/docs/source/model_doc/xlmroberta.mdx
new file mode 100644
index 0000000000..bcfdca1817
--- /dev/null
+++ b/docs/source/model_doc/xlmroberta.mdx
@@ -0,0 +1,126 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# XLM-RoBERTa
+
+## Overview
+
+The XLM-RoBERTa model was proposed in [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume
+Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's
+RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl
+data.
+
+The abstract from the paper is the following:
+
+*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a
+wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred
+languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly
+outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy on
+XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs particularly well on
+low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model. We
+also present a detailed empirical evaluation of the key factors that are required to achieve these gains, including the
+trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource
+languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing
+per-language performance; XLM-Ris very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We
+will make XLM-R code, data, and models publicly available.*
+
+Tips:
+
+- XLM-RoBERTa is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does
+  not require `lang` tensors to understand which language is used, and should be able to determine the correct
+  language from the input ids.
+- This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples
+  as well as the information relative to the inputs and outputs.
+
+This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr).
+
+
+## XLMRobertaConfig
+
+[[autodoc]] XLMRobertaConfig
+
+## XLMRobertaTokenizer
+
+[[autodoc]] XLMRobertaTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## XLMRobertaTokenizerFast
+
+[[autodoc]] XLMRobertaTokenizerFast
+
+## XLMRobertaModel
+
+[[autodoc]] XLMRobertaModel
+    - forward
+
+## XLMRobertaForCausalLM
+
+[[autodoc]] XLMRobertaForCausalLM
+    - forward
+
+## XLMRobertaForMaskedLM
+
+[[autodoc]] XLMRobertaForMaskedLM
+    - forward
+
+## XLMRobertaForSequenceClassification
+
+[[autodoc]] XLMRobertaForSequenceClassification
+    - forward
+
+## XLMRobertaForMultipleChoice
+
+[[autodoc]] XLMRobertaForMultipleChoice
+    - forward
+
+## XLMRobertaForTokenClassification
+
+[[autodoc]] XLMRobertaForTokenClassification
+    - forward
+
+## XLMRobertaForQuestionAnswering
+
+[[autodoc]] XLMRobertaForQuestionAnswering
+    - forward
+
+## TFXLMRobertaModel
+
+[[autodoc]] TFXLMRobertaModel
+    - call
+
+## TFXLMRobertaForMaskedLM
+
+[[autodoc]] TFXLMRobertaForMaskedLM
+    - call
+
+## TFXLMRobertaForSequenceClassification
+
+[[autodoc]] TFXLMRobertaForSequenceClassification
+    - call
+
+## TFXLMRobertaForMultipleChoice
+
+[[autodoc]] TFXLMRobertaForMultipleChoice
+    - call
+
+## TFXLMRobertaForTokenClassification
+
+[[autodoc]] TFXLMRobertaForTokenClassification
+    - call
+
+## TFXLMRobertaForQuestionAnswering
+
+[[autodoc]] TFXLMRobertaForQuestionAnswering
+    - call
diff --git a/docs/source/model_doc/xlmroberta.rst b/docs/source/model_doc/xlmroberta.rst
deleted file mode 100644
index c24bbf7f50..0000000000
--- a/docs/source/model_doc/xlmroberta.rst
+++ /dev/null
@@ -1,161 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-XLM-RoBERTa
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning at Scale
-<https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume
-Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's
-RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl
-data.
-
-The abstract from the paper is the following:
-
-*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a
-wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred
-languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly
-outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy on
-XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs particularly well on
-low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model. We
-also present a detailed empirical evaluation of the key factors that are required to achieve these gains, including the
-trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource
-languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing
-per-language performance; XLM-Ris very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We
-will make XLM-R code, data, and models publicly available.*
-
-Tips:
-
-- XLM-RoBERTa is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does
-  not require :obj:`lang` tensors to understand which language is used, and should be able to determine the correct
-  language from the input ids.
-- This implementation is the same as RoBERTa. Refer to the :doc:`documentation of RoBERTa <roberta>` for usage examples
-  as well as the information relative to the inputs and outputs.
-
-This model was contributed by `stefan-it <https://huggingface.co/stefan-it>`__. The original code can be found `here
-<https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`__.
-
-
-XLMRobertaConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaConfig
-    :members:
-
-
-XLMRobertaTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-XLMRobertaTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaTokenizerFast
-    :members:
-
-
-XLMRobertaModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaModel
-    :members: forward
-
-
-XLMRobertaForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaForCausalLM
-    :members: forward
-
-
-XLMRobertaForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaForMaskedLM
-    :members: forward
-
-
-XLMRobertaForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaForSequenceClassification
-    :members: forward
-
-
-XLMRobertaForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaForMultipleChoice
-    :members: forward
-
-
-XLMRobertaForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaForTokenClassification
-    :members: forward
-
-
-XLMRobertaForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaForQuestionAnswering
-    :members: forward
-
-
-TFXLMRobertaModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMRobertaModel
-    :members: call
-
-
-TFXLMRobertaForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMRobertaForMaskedLM
-    :members: call
-
-
-TFXLMRobertaForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMRobertaForSequenceClassification
-    :members: call
-
-
-TFXLMRobertaForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMRobertaForMultipleChoice
-    :members: call
-
-
-TFXLMRobertaForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMRobertaForTokenClassification
-    :members: call
-
-
-TFXLMRobertaForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMRobertaForQuestionAnswering
-    :members: call
diff --git a/docs/source/model_doc/xlnet.mdx b/docs/source/model_doc/xlnet.mdx
new file mode 100644
index 0000000000..ca30574690
--- /dev/null
+++ b/docs/source/model_doc/xlnet.mdx
@@ -0,0 +1,154 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# XLNet
+
+## Overview
+
+The XLNet model was proposed in [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov,
+Quoc V. Le. XLnet is an extension of the Transformer-XL model pre-trained using an autoregressive method to learn
+bidirectional contexts by maximizing the expected likelihood over all permutations of the input sequence factorization
+order.
+
+The abstract from the paper is the following:
+
+*With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves
+better performance than pretraining approaches based on autoregressive language modeling. However, relying on
+corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a
+pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive
+pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over all
+permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive
+formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model, into
+pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by a large
+margin, including question answering, natural language inference, sentiment analysis, and document ranking.*
+
+Tips:
+
+- The specific attention pattern can be controlled at training and test time using the `perm_mask` input.
+- Due to the difficulty of training a fully auto-regressive model over various factorization order, XLNet is pretrained
+  using only a sub-set of the output tokens as target which are selected with the `target_mapping` input.
+- To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and
+  `target_mapping` inputs to control the attention span and outputs (see examples in
+  *examples/pytorch/text-generation/run_generation.py*)
+- XLNet is one of the few models that has no sequence length limit.
+
+This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/zihangdai/xlnet/).
+
+
+## XLNetConfig
+
+[[autodoc]] XLNetConfig
+
+## XLNetTokenizer
+
+[[autodoc]] XLNetTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## XLNetTokenizerFast
+
+[[autodoc]] XLNetTokenizerFast
+
+## XLNet specific outputs
+
+[[autodoc]] models.xlnet.modeling_xlnet.XLNetModelOutput
+
+[[autodoc]] models.xlnet.modeling_xlnet.XLNetLMHeadModelOutput
+
+[[autodoc]] models.xlnet.modeling_xlnet.XLNetForSequenceClassificationOutput
+
+[[autodoc]] models.xlnet.modeling_xlnet.XLNetForMultipleChoiceOutput
+
+[[autodoc]] models.xlnet.modeling_xlnet.XLNetForTokenClassificationOutput
+
+[[autodoc]] models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput
+
+[[autodoc]] models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringOutput
+
+[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetModelOutput
+
+[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetLMHeadModelOutput
+
+[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput
+
+[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput
+
+[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput
+
+[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput
+
+## XLNetModel
+
+[[autodoc]] XLNetModel
+    - forward
+
+## XLNetLMHeadModel
+
+[[autodoc]] XLNetLMHeadModel
+    - forward
+
+## XLNetForSequenceClassification
+
+[[autodoc]] XLNetForSequenceClassification
+    - forward
+
+## XLNetForMultipleChoice
+
+[[autodoc]] XLNetForMultipleChoice
+    - forward
+
+## XLNetForTokenClassification
+
+[[autodoc]] XLNetForTokenClassification
+    - forward
+
+## XLNetForQuestionAnsweringSimple
+
+[[autodoc]] XLNetForQuestionAnsweringSimple
+    - forward
+
+## XLNetForQuestionAnswering
+
+[[autodoc]] XLNetForQuestionAnswering
+    - forward
+
+## TFXLNetModel
+
+[[autodoc]] TFXLNetModel
+    - call
+
+## TFXLNetLMHeadModel
+
+[[autodoc]] TFXLNetLMHeadModel
+    - call
+
+## TFXLNetForSequenceClassification
+
+[[autodoc]] TFXLNetForSequenceClassification
+    - call
+
+## TFLNetForMultipleChoice
+
+[[autodoc]] TFXLNetForMultipleChoice
+    - call
+
+## TFXLNetForTokenClassification
+
+[[autodoc]] TFXLNetForTokenClassification
+    - call
+
+## TFXLNetForQuestionAnsweringSimple
+
+[[autodoc]] TFXLNetForQuestionAnsweringSimple
+    - call
diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst
deleted file mode 100644
index 8d46935cdc..0000000000
--- a/docs/source/model_doc/xlnet.rst
+++ /dev/null
@@ -1,204 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-XLNet
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The XLNet model was proposed in `XLNet: Generalized Autoregressive Pretraining for Language Understanding
-<https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov,
-Quoc V. Le. XLnet is an extension of the Transformer-XL model pre-trained using an autoregressive method to learn
-bidirectional contexts by maximizing the expected likelihood over all permutations of the input sequence factorization
-order.
-
-The abstract from the paper is the following:
-
-*With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves
-better performance than pretraining approaches based on autoregressive language modeling. However, relying on
-corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a
-pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive
-pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over all
-permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive
-formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model, into
-pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by a large
-margin, including question answering, natural language inference, sentiment analysis, and document ranking.*
-
-Tips:
-
-- The specific attention pattern can be controlled at training and test time using the :obj:`perm_mask` input.
-- Due to the difficulty of training a fully auto-regressive model over various factorization order, XLNet is pretrained
-  using only a sub-set of the output tokens as target which are selected with the :obj:`target_mapping` input.
-- To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the :obj:`perm_mask` and
-  :obj:`target_mapping` inputs to control the attention span and outputs (see examples in
-  `examples/pytorch/text-generation/run_generation.py`)
-- XLNet is one of the few models that has no sequence length limit.
-
-This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
-<https://github.com/zihangdai/xlnet/>`__.
-
-
-XLNetConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetConfig
-    :members:
-
-
-XLNetTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-XLNetTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetTokenizerFast
-    :members:
-
-
-XLNet specific outputs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetModelOutput
-    :members:
-
-.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetLMHeadModelOutput
-    :members:
-
-.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForSequenceClassificationOutput
-    :members:
-
-.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForMultipleChoiceOutput
-    :members:
-
-.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForTokenClassificationOutput
-    :members:
-
-.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput
-    :members:
-
-.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringOutput
-    :members:
-
-.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetModelOutput
-    :members:
-
-.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetLMHeadModelOutput
-    :members:
-
-.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput
-    :members:
-
-.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput
-    :members:
-
-.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput
-    :members:
-
-.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput
-    :members:
-
-
-XLNetModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetModel
-    :members: forward
-
-
-XLNetLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetLMHeadModel
-    :members: forward
-
-
-XLNetForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForSequenceClassification
-    :members: forward
-
-
-XLNetForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForMultipleChoice
-    :members: forward
-
-
-XLNetForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForTokenClassification
-    :members: forward
-
-
-XLNetForQuestionAnsweringSimple
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForQuestionAnsweringSimple
-    :members: forward
-
-
-XLNetForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForQuestionAnswering
-    :members: forward
-
-
-TFXLNetModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLNetModel
-    :members: call
-
-
-TFXLNetLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLNetLMHeadModel
-    :members: call
-
-
-TFXLNetForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLNetForSequenceClassification
-    :members: call
-
-
-TFLNetForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLNetForMultipleChoice
-    :members: call
-
-
-TFXLNetForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLNetForTokenClassification
-    :members: call
-
-
-TFXLNetForQuestionAnsweringSimple
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple
-    :members: call
diff --git a/docs/source/model_doc/xls_r.rst b/docs/source/model_doc/xls_r.mdx
similarity index 56%
rename from docs/source/model_doc/xls_r.rst
rename to docs/source/model_doc/xls_r.mdx
index 889243541a..82a7e3b8af 100644
--- a/docs/source/model_doc/xls_r.rst
+++ b/docs/source/model_doc/xls_r.mdx
@@ -1,23 +1,20 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
 
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
 
-        http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
 
-XLS-R
------------------------------------------------------------------------------------------------------------------------
+# XLS-R
 
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+## Overview
 
-The XLS-R model was proposed in `XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale
-<https://arxiv.org/abs/2111.09296>`__ by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman
+The XLS-R model was proposed in [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman
 Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
 
 The abstract from the paper is the following:
@@ -37,11 +34,10 @@ Tips:
 
 - XLS-R is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
 - XLS-R model was trained using connectionist temporal classification (CTC) so the model output has to be decoded using
-  :class:`~transformers.Wav2Vec2CTCTokenizer`.
+  [`Wav2Vec2CTCTokenizer`].
 
 Relevant checkpoints can be found under https://huggingface.co/models?other=xls_r.
 
-XLS-R's architecture is based on the Wav2Vec2 model, so one can refer to :doc:`Wav2Vec2's documentation page
-<wav2vec2>`.
+XLS-R's architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2).
 
-The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec>`__.
+The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec).
diff --git a/docs/source/model_doc/xlsr_wav2vec2.rst b/docs/source/model_doc/xlsr_wav2vec2.mdx
similarity index 52%
rename from docs/source/model_doc/xlsr_wav2vec2.rst
rename to docs/source/model_doc/xlsr_wav2vec2.mdx
index 623332813c..32229f28b1 100644
--- a/docs/source/model_doc/xlsr_wav2vec2.rst
+++ b/docs/source/model_doc/xlsr_wav2vec2.mdx
@@ -1,23 +1,20 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
 
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
 
-        http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
 
-XLSR-Wav2Vec2
------------------------------------------------------------------------------------------------------------------------
+# XLSR-Wav2Vec2
 
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+## Overview
 
-The XLSR-Wav2Vec2 model was proposed in `Unsupervised Cross-Lingual Representation Learning For Speech Recognition
-<https://arxiv.org/abs/2006.13979>`__ by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael
+The XLSR-Wav2Vec2 model was proposed in [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael
 Auli.
 
 The abstract from the paper is the following:
@@ -37,9 +34,8 @@ Tips:
 
 - XLSR-Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
 - XLSR-Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be
-  decoded using :class:`~transformers.Wav2Vec2CTCTokenizer`.
+  decoded using [`Wav2Vec2CTCTokenizer`].
 
-XLSR-Wav2Vec2's architecture is based on the Wav2Vec2 model, so one can refer to :doc:`Wav2Vec2's documentation page
-<wav2vec2>`.
+XLSR-Wav2Vec2's architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2).
 
-The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec>`__.
+The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec).
diff --git a/docs/source/model_sharing.mdx b/docs/source/model_sharing.mdx
new file mode 100644
index 0000000000..d7f3e7be80
--- /dev/null
+++ b/docs/source/model_sharing.mdx
@@ -0,0 +1,395 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Model sharing and uploading
+
+In this page, we will show you how to share a model you have trained or fine-tuned on new data with the community on
+the [model hub](https://huggingface.co/models).
+
+<iframe width="560" height="315" src="https://www.youtube.com/embed/XvSGPZFEjDY" title="YouTube video player"
+frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
+picture-in-picture" allowfullscreen></iframe>
+
+<Tip>
+
+You will need to create an account on [huggingface.co](https://huggingface.co/join) for this.
+
+Optionally, you can join an existing organization or create a new one.
+
+</Tip>
+
+We have seen in the [training tutorial](training): how to fine-tune a model on a given task. You have probably
+done something similar on your task, either using the model `fit()` method, directly in your own training loop or using the
+[`Trainer`] class. Let's see how you can share the result on the
+[model hub](https://huggingface.co/models).
+
+## Model versioning
+
+Since version v3.5.0, the model hub has built-in model versioning based on git and git-lfs. It is based on the paradigm
+that one model *is* one repo.
+
+This allows:
+
+- built-in versioning
+- access control
+- scalability
+
+This is built around *revisions*, which is a way to pin a specific version of a model, using a commit hash, tag or
+branch.
+
+For instance:
+
+```python
+>>> model = AutoModel.from_pretrained(
+>>>     "julien-c/EsperBERTo-small",
+>>>     revision="v2.0.1" # tag name, or branch name, or commit hash
+>>> )
+```
+
+## Push your model from Python
+
+### Preparation
+
+The first step is to make sure your credentials to the hub are stored somewhere. This can be done in two ways. If you
+have access to a terminal, you cam just run the following command in the virtual environment where you installed 🤗
+Transformers:
+
+```bash
+huggingface-cli login
+```
+
+It will store your access token in the Hugging Face cache folder (by default `~/.cache/`).
+
+If you don't have an easy access to a terminal (for instance in a Colab session), you can find a token linked to your
+account by going on [huggingface.co](https://huggingface.co/), click on your avatar on the top left corner, then on
+*Edit profile* on the left, just beneath your profile picture. In the submenu *API Tokens*, you will find your API
+token that you can just copy.
+
+### Directly push your model to the hub
+
+<Youtube id="Z1-XMy-GNLQ"/>
+
+Once you have an API token (either stored in the cache or copied and pasted in your notebook), you can directly push a
+finetuned model you saved in `save_directory` by calling:
+
+```python
+finetuned_model.push_to_hub("my-awesome-model")
+```
+
+If you have your API token not stored in the cache, you will need to pass it with `use_auth_token=your_token`.
+This is also be the case for all the examples below, so we won't mention it again.
+
+This will create a repository in your namespace name `my-awesome-model`, so anyone can now run:
+
+```python
+from transformers import AutoModel
+
+model = AutoModel.from_pretrained("your_username/my-awesome-model")
+```
+
+Even better, you can combine this push to the hub with the call to `save_pretrained`:
+
+```python
+finetuned_model.save_pretrained(save_directory, push_to_hub=True, repo_name="my-awesome-model")
+```
+
+If you are a premium user and want your model to be private, just add `private=True` to this call.
+
+If you are a member of an organization and want to push it inside the namespace of the organization instead of yours,
+just add `organization=my_amazing_org`.
+
+### Add new files to your model repo
+
+Once you have pushed your model to the hub, you might want to add the tokenizer, or a version of your model for another
+framework (TensorFlow, PyTorch, Flax). This is super easy to do! Let's begin with the tokenizer. You can add it to the
+repo you created before like this
+
+```python
+tokenizer.push_to_hub("my-awesome-model")
+```
+
+If you know its URL (it should be `https://huggingface.co/username/repo_name`), you can also do:
+
+```python
+tokenizer.push_to_hub(repo_url=my_repo_url)
+```
+
+And that's all there is to it! It's also a very easy way to fix a mistake if one of the files online had a bug.
+
+To add a model for another backend, it's also super easy. Let's say you have fine-tuned a TensorFlow model and want to
+add the pytorch model files to your model repo, so that anyone in the community can use it. The following allows you to
+directly create a PyTorch version of your TensorFlow model:
+
+```python
+from transformers import AutoModel
+
+model = AutoModel.from_pretrained(save_directory, from_tf=True)
+```
+
+You can also replace `save_directory` by the identifier of your model (`username/repo_name`) if you don't
+have a local save of it anymore. Then, just do the same as before:
+
+```python
+model.push_to_hub("my-awesome-model")
+```
+
+or
+
+```python
+model.push_to_hub(repo_url=my_repo_url)
+```
+
+## Use your terminal and git
+
+<Youtube id="rkCly_cbMBk"/>
+
+### Basic steps
+
+In order to upload a model, you'll need to first create a git repo. This repo will live on the model hub, allowing
+users to clone it and you (and your organization members) to push to it.
+
+You can create a model repo directly from [the /new page on the website](https://huggingface.co/new).
+
+Alternatively, you can use the `transformers-cli`. The next steps describe that process:
+
+Go to a terminal and run the following command. It should be in the virtual environment where you installed 🤗
+Transformers, since that command `transformers-cli` comes from the library.
+
+```bash
+transformers-cli login
+```
+
+Once you are logged in with your model hub credentials, you can start building your repositories. To create a repo:
+
+```bash
+transformers-cli repo create your-model-name
+```
+
+If you want to create a repo under a specific organization, you should add a *--organization* flag:
+
+```bash
+transformers-cli repo create your-model-name --organization your-org-name
+```
+
+This creates a repo on the model hub, which can be cloned.
+
+```bash
+# Make sure you have git-lfs installed
+# (https://git-lfs.github.com/)
+git lfs install
+
+git clone https://huggingface.co/username/your-model-name
+```
+
+When you have your local clone of your repo and lfs installed, you can then add/remove from that clone as you would
+with any other git repo.
+
+```bash
+# Commit as usual
+cd your-model-name
+echo "hello" >> README.md
+git add . && git commit -m "Update from $USER"
+```
+
+We are intentionally not wrapping git too much, so that you can go on with the workflow you're used to and the tools
+you already know.
+
+The only learning curve you might have compared to regular git is the one for git-lfs. The documentation at
+[git-lfs.github.com](https://git-lfs.github.com/) is decent, but we'll work on a tutorial with some tips and tricks
+in the coming weeks!
+
+Additionally, if you want to change multiple repos at once, the [change_config.py script](https://github.com/huggingface/efficient_scripts/blob/main/change_config.py) can probably save you some time.
+
+### Make your model work on all frameworks
+
+<!--TODO Sylvain: make this automatic during the upload
+-->
+
+You probably have your favorite framework, but so will other users! That's why it's best to upload your model with both
+PyTorch *and* TensorFlow checkpoints to make it easier to use (if you skip this step, users will still be able to load
+your model in another framework, but it will be slower, as it will have to be converted on the fly). Don't worry, it's
+super easy to do (and in a future version, it might all be automatic). You will need to install both PyTorch and
+TensorFlow for this step, but you don't need to worry about the GPU, so it should be very easy. Check the [TensorFlow
+installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or the [PyTorch
+installation page](https://pytorch.org/get-started/locally/#start-locally) to see how.
+
+First check that your model class exists in the other framework, that is try to import the same model by either adding
+or removing TF. For instance, if you trained a [`DistilBertForSequenceClassification`], try to type
+
+```python
+>>> from transformers import TFDistilBertForSequenceClassification
+```
+
+and if you trained a [`TFDistilBertForSequenceClassification`], try to type
+
+```python
+>>> from transformers import DistilBertForSequenceClassification
+```
+
+This will give back an error if your model does not exist in the other framework (something that should be pretty rare
+since we're aiming for full parity between the two frameworks). In this case, skip this and go to the next step.
+
+Now, if you trained your model in PyTorch and have to create a TensorFlow version, adapt the following code to your
+model class:
+
+```python
+>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True)
+>>> tf_model.save_pretrained("path/to/awesome-name-you-picked")
+```
+
+and if you trained your model in TensorFlow and have to create a PyTorch version, adapt the following code to your
+model class:
+
+```python
+>>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True)
+>>> pt_model.save_pretrained("path/to/awesome-name-you-picked")
+```
+
+That's all there is to it!
+
+### Check the directory before pushing to the model hub.
+
+Make sure there are no garbage files in the directory you'll upload. It should only have:
+
+- a *config.json* file, which saves the [configuration](main_classes/configuration) of your model ;
+- a *pytorch_model.bin* file, which is the PyTorch checkpoint (unless you can't have it for some reason) ;
+- a *tf_model.h5* file, which is the TensorFlow checkpoint (unless you can't have it for some reason) ;
+- a *special_tokens_map.json*, which is part of your [tokenizer](main_classes/tokenizer) save;
+- a *tokenizer_config.json*, which is part of your [tokenizer](main_classes/tokenizer) save;
+- files named *vocab.json*, *vocab.txt*, *merges.txt*, or similar, which contain the vocabulary of your tokenizer, part
+  of your [tokenizer](main_classes/tokenizer) save;
+- maybe a *added_tokens.json*, which is part of your [tokenizer](main_classes/tokenizer) save.
+
+Other files can safely be deleted.
+
+
+## Uploading your files
+
+Once the repo is cloned, you can add the model, configuration and tokenizer files. For instance, saving the model and
+tokenizer files:
+
+```python
+>>> model.save_pretrained("path/to/repo/clone/your-model-name")
+>>> tokenizer.save_pretrained("path/to/repo/clone/your-model-name")
+```
+
+Or, if you're using the Trainer API
+
+```python
+>>> trainer.save_model("path/to/awesome-name-you-picked")
+>>> tokenizer.save_pretrained("path/to/repo/clone/your-model-name")
+```
+
+You can then add these files to the staging environment and verify that they have been correctly staged with the `git status` command:
+
+```bash
+git add --all
+git status
+```
+
+Finally, the files should be committed:
+
+```bash
+git commit -m "First version of the your-model-name model and tokenizer."
+```
+
+And pushed to the remote:
+
+```bash
+git push
+```
+
+This will upload the folder containing the weights, tokenizer and configuration we have just prepared.
+
+
+### Add a model card
+
+To make sure everyone knows what your model can do, what its limitations, potential bias or ethical considerations are,
+please add a README.md model card to your model repo. You can just create it, or there's also a convenient button
+titled "Add a README.md" on your model page. A model card documentation can be found [here](https://huggingface.co/docs/hub/model-repos) (meta-suggestions are welcome). model card template (meta-suggestions
+are welcome).
+
+<Tip>
+
+Model cards used to live in the 🤗 Transformers repo under *model_cards/*, but for consistency and scalability we
+migrated every model card from the repo to its corresponding huggingface.co model repo.
+
+</Tip>
+
+If your model is fine-tuned from another model coming from the model hub (all 🤗 Transformers pretrained models do),
+don't forget to link to its model card so that people can fully trace how your model was built.
+
+
+### Using your model
+
+Your model now has a page on huggingface.co/models 🔥
+
+Anyone can load it from code:
+
+```python
+>>> tokenizer = AutoTokenizer.from_pretrained("namespace/awesome-name-you-picked")
+>>> model = AutoModel.from_pretrained("namespace/awesome-name-you-picked")
+```
+
+You may specify a revision by using the `revision` flag in the `from_pretrained` method:
+
+```python
+>>> tokenizer = AutoTokenizer.from_pretrained(
+>>>   "julien-c/EsperBERTo-small",
+>>>   revision="v2.0.1" # tag name, or branch name, or commit hash
+>>> )
+```
+
+## Workflow in a Colab notebook
+
+If you're in a Colab notebook (or similar) with no direct access to a terminal, here is the workflow you can use to
+upload your model. You can execute each one of them in a cell by adding a ! at the beginning.
+
+First you need to install *git-lfs* in the environment used by the notebook:
+
+```bash
+sudo apt-get install git-lfs
+```
+
+Then you can use either create a repo directly from [huggingface.co](https://huggingface.co/) , or use the
+`transformers-cli` to create it:
+
+
+```bash
+transformers-cli login
+transformers-cli repo create your-model-name
+```
+
+Once it's created, you can clone it and configure it (replace username by your username on huggingface.co):
+
+```bash
+git lfs install
+
+git clone https://username:password@huggingface.co/username/your-model-name
+# Alternatively if you have a token,
+# you can use it instead of your password
+git clone https://username:token@huggingface.co/username/your-model-name
+
+cd your-model-name
+git config --global user.email "email@example.com"
+# Tip: using the same email than for your huggingface.co account will link your commits to your profile
+git config --global user.name "Your name"
+```
+
+Once you've saved your model inside, and your clone is setup with the right remote URL, you can add it and push it with
+usual git commands.
+
+```bash
+git add .
+git commit -m "Initial commit"
+git push
+```
diff --git a/docs/source/model_sharing.rst b/docs/source/model_sharing.rst
deleted file mode 100644
index 345e0db1be..0000000000
--- a/docs/source/model_sharing.rst
+++ /dev/null
@@ -1,423 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Model sharing and uploading
-=======================================================================================================================
-
-In this page, we will show you how to share a model you have trained or fine-tuned on new data with the community on
-the `model hub <https://huggingface.co/models>`__.
-
-.. raw:: html
-
-   <iframe width="560" height="315" src="https://www.youtube.com/embed/XvSGPZFEjDY" title="YouTube video player"
-   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
-   picture-in-picture" allowfullscreen></iframe>
-
-.. note::
-
-    You will need to create an account on `huggingface.co <https://huggingface.co/join>`__ for this.
-
-    Optionally, you can join an existing organization or create a new one.
-
-
-We have seen in the :doc:`training tutorial <training>`: how to fine-tune a model on a given task. You have probably
-done something similar on your task, either using the model directly in your own training loop or using the
-:class:`~.transformers.Trainer`/:class:`~.transformers.TFTrainer` class. Let's see how you can share the result on the
-`model hub <https://huggingface.co/models>`__.
-
-Model versioning
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Since version v3.5.0, the model hub has built-in model versioning based on git and git-lfs. It is based on the paradigm
-that one model *is* one repo.
-
-This allows:
-
-- built-in versioning
-- access control
-- scalability
-
-This is built around *revisions*, which is a way to pin a specific version of a model, using a commit hash, tag or
-branch.
-
-For instance:
-
-.. code-block::
-
-    >>> model = AutoModel.from_pretrained(
-    >>>   "julien-c/EsperBERTo-small",
-    >>>   revision="v2.0.1" # tag name, or branch name, or commit hash
-    >>> )
-
-
-Push your model from Python
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Preparation
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The first step is to make sure your credentials to the hub are stored somewhere. This can be done in two ways. If you
-have access to a terminal, you cam just run the following command in the virtual environment where you installed 🤗
-Transformers:
-
-.. code-block:: bash
-
-    transformers-cli login
-
-It will store your access token in the Hugging Face cache folder (by default :obj:`~/.cache/`).
-
-If you don't have an easy access to a terminal (for instance in a Colab session), you can find a token linked to your
-account by going on `huggingface.co <https://huggingface.co/>`, click on your avatar on the top left corner, then on
-`Edit profile` on the left, just beneath your profile picture. In the submenu `API Tokens`, you will find your API
-token that you can just copy.
-
-Directly push your model to the hub
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-   <iframe width="560" height="315" src="https://www.youtube.com/embed/Z1-XMy-GNLQ" title="YouTube video player"
-   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
-   picture-in-picture" allowfullscreen></iframe>
-
-Once you have an API token (either stored in the cache or copied and pasted in your notebook), you can directly push a
-finetuned model you saved in :obj:`save_directory` by calling:
-
-.. code-block:: python
-
-    finetuned_model.push_to_hub("my-awesome-model")
-
-If you have your API token not stored in the cache, you will need to pass it with :obj:`use_auth_token=your_token`.
-This is also be the case for all the examples below, so we won't mention it again.
-
-This will create a repository in your namespace name :obj:`my-awesome-model`, so anyone can now run:
-
-.. code-block:: python
-
-    from transformers import AutoModel
-
-    model = AutoModel.from_pretrained("your_username/my-awesome-model")
-
-Even better, you can combine this push to the hub with the call to :obj:`save_pretrained`:
-
-.. code-block:: python
-
-    finetuned_model.save_pretrained(save_directory, push_to_hub=True, repo_name="my-awesome-model")
-
-If you are a premium user and want your model to be private, just add :obj:`private=True` to this call.
-
-If you are a member of an organization and want to push it inside the namespace of the organization instead of yours,
-just add :obj:`organization=my_amazing_org`.
-
-Add new files to your model repo
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Once you have pushed your model to the hub, you might want to add the tokenizer, or a version of your model for another
-framework (TensorFlow, PyTorch, Flax). This is super easy to do! Let's begin with the tokenizer. You can add it to the
-repo you created before like this
-
-.. code-block:: python
-
-    tokenizer.push_to_hub("my-awesome-model")
-
-If you know its URL (it should be :obj:`https://huggingface.co/username/repo_name`), you can also do:
-
-.. code-block:: python
-
-    tokenizer.push_to_hub(repo_url=my_repo_url)
-
-And that's all there is to it! It's also a very easy way to fix a mistake if one of the files online had a bug.
-
-To add a model for another backend, it's also super easy. Let's say you have fine-tuned a TensorFlow model and want to
-add the pytorch model files to your model repo, so that anyone in the community can use it. The following allows you to
-directly create a PyTorch version of your TensorFlow model:
-
-.. code-block:: python
-
-    from transformers import AutoModel
-
-    model = AutoModel.from_pretrained(save_directory, from_tf=True)
-
-You can also replace :obj:`save_directory` by the identifier of your model (:obj:`username/repo_name`) if you don't
-have a local save of it anymore. Then, just do the same as before:
-
-.. code-block:: python
-
-    model.push_to_hub("my-awesome-model")
-
-or
-
-.. code-block:: python
-
-    model.push_to_hub(repo_url=my_repo_url)
-
-
-Use your terminal and git
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. raw:: html
-
-   <iframe width="560" height="315" src="https://www.youtube.com/embed/rkCly_cbMBk" title="YouTube video player"
-   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
-   picture-in-picture" allowfullscreen></iframe>
-
-Basic steps
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-In order to upload a model, you'll need to first create a git repo. This repo will live on the model hub, allowing
-users to clone it and you (and your organization members) to push to it.
-
-You can create a model repo directly from `the /new page on the website <https://huggingface.co/new>`__.
-
-Alternatively, you can use the ``transformers-cli``. The next steps describe that process:
-
-Go to a terminal and run the following command. It should be in the virtual environment where you installed 🤗
-Transformers, since that command :obj:`transformers-cli` comes from the library.
-
-.. code-block:: bash
-
-    transformers-cli login
-
-
-Once you are logged in with your model hub credentials, you can start building your repositories. To create a repo:
-
-.. code-block:: bash
-
-    transformers-cli repo create your-model-name
-
-If you want to create a repo under a specific organization, you should add a `--organization` flag:
-
-.. code-block:: bash
-
-    transformers-cli repo create your-model-name --organization your-org-name
-
-This creates a repo on the model hub, which can be cloned.
-
-.. code-block:: bash
-
-    # Make sure you have git-lfs installed
-    # (https://git-lfs.github.com/)
-    git lfs install
-
-    git clone https://huggingface.co/username/your-model-name
-
-When you have your local clone of your repo and lfs installed, you can then add/remove from that clone as you would
-with any other git repo.
-
-.. code-block:: bash
-
-    # Commit as usual
-    cd your-model-name
-    echo "hello" >> README.md
-    git add . && git commit -m "Update from $USER"
-
-We are intentionally not wrapping git too much, so that you can go on with the workflow you're used to and the tools
-you already know.
-
-The only learning curve you might have compared to regular git is the one for git-lfs. The documentation at
-`git-lfs.github.com <https://git-lfs.github.com/>`__ is decent, but we'll work on a tutorial with some tips and tricks
-in the coming weeks!
-
-Additionally, if you want to change multiple repos at once, the `change_config.py script
-<https://github.com/huggingface/efficient_scripts/blob/main/change_config.py>`__ can probably save you some time.
-
-Make your model work on all frameworks
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. 
-    TODO Sylvain: make this automatic during the upload
-
-You probably have your favorite framework, but so will other users! That's why it's best to upload your model with both
-PyTorch `and` TensorFlow checkpoints to make it easier to use (if you skip this step, users will still be able to load
-your model in another framework, but it will be slower, as it will have to be converted on the fly). Don't worry, it's
-super easy to do (and in a future version, it might all be automatic). You will need to install both PyTorch and
-TensorFlow for this step, but you don't need to worry about the GPU, so it should be very easy. Check the `TensorFlow
-installation page <https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available>`__ and/or the `PyTorch
-installation page <https://pytorch.org/get-started/locally/#start-locally>`__ to see how.
-
-First check that your model class exists in the other framework, that is try to import the same model by either adding
-or removing TF. For instance, if you trained a :class:`~transformers.DistilBertForSequenceClassification`, try to type
-
-.. code-block::
-
-    >>> from transformers import TFDistilBertForSequenceClassification
-
-and if you trained a :class:`~transformers.TFDistilBertForSequenceClassification`, try to type
-
-.. code-block::
-
-    >>> from transformers import DistilBertForSequenceClassification
-
-This will give back an error if your model does not exist in the other framework (something that should be pretty rare
-since we're aiming for full parity between the two frameworks). In this case, skip this and go to the next step.
-
-Now, if you trained your model in PyTorch and have to create a TensorFlow version, adapt the following code to your
-model class:
-
-.. code-block::
-
-    >>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True)
-    >>> tf_model.save_pretrained("path/to/awesome-name-you-picked")
-
-and if you trained your model in TensorFlow and have to create a PyTorch version, adapt the following code to your
-model class:
-
-.. code-block::
-
-    >>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True)
-    >>> pt_model.save_pretrained("path/to/awesome-name-you-picked")
-
-That's all there is to it!
-
-Check the directory before pushing to the model hub.
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Make sure there are no garbage files in the directory you'll upload. It should only have:
-
-- a `config.json` file, which saves the :doc:`configuration <main_classes/configuration>` of your model ;
-- a `pytorch_model.bin` file, which is the PyTorch checkpoint (unless you can't have it for some reason) ;
-- a `tf_model.h5` file, which is the TensorFlow checkpoint (unless you can't have it for some reason) ;
-- a `special_tokens_map.json`, which is part of your :doc:`tokenizer <main_classes/tokenizer>` save;
-- a `tokenizer_config.json`, which is part of your :doc:`tokenizer <main_classes/tokenizer>` save;
-- files named `vocab.json`, `vocab.txt`, `merges.txt`, or similar, which contain the vocabulary of your tokenizer, part
-  of your :doc:`tokenizer <main_classes/tokenizer>` save;
-- maybe a `added_tokens.json`, which is part of your :doc:`tokenizer <main_classes/tokenizer>` save.
-
-Other files can safely be deleted.
-
-
-Uploading your files
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Once the repo is cloned, you can add the model, configuration and tokenizer files. For instance, saving the model and
-tokenizer files:
-
-.. code-block::
-
-    >>> model.save_pretrained("path/to/repo/clone/your-model-name")
-    >>> tokenizer.save_pretrained("path/to/repo/clone/your-model-name")
-
-Or, if you're using the Trainer API
-
-.. code-block::
-
-    >>> trainer.save_model("path/to/awesome-name-you-picked")
-    >>> tokenizer.save_pretrained("path/to/repo/clone/your-model-name")
-
-You can then add these files to the staging environment and verify that they have been correctly staged with the ``git
-status`` command:
-
-.. code-block:: bash
-
-    git add --all
-    git status
-
-Finally, the files should be committed:
-
-.. code-block:: bash
-
-    git commit -m "First version of the your-model-name model and tokenizer."
-
-And pushed to the remote:
-
-.. code-block:: bash
-
-    git push
-
-This will upload the folder containing the weights, tokenizer and configuration we have just prepared.
-
-
-Add a model card
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-To make sure everyone knows what your model can do, what its limitations, potential bias or ethical considerations are,
-please add a README.md model card to your model repo. You can just create it, or there's also a convenient button
-titled "Add a README.md" on your model page. A model card documentation can be found `here
-<https://huggingface.co/docs/hub/model-repos>`__ (meta-suggestions are welcome). model card template (meta-suggestions
-are welcome).
-
-.. note::
-
-    Model cards used to live in the 🤗 Transformers repo under `model_cards/`, but for consistency and scalability we
-    migrated every model card from the repo to its corresponding huggingface.co model repo.
-
-If your model is fine-tuned from another model coming from the model hub (all 🤗 Transformers pretrained models do),
-don't forget to link to its model card so that people can fully trace how your model was built.
-
-
-Using your model
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Your model now has a page on huggingface.co/models 🔥
-
-Anyone can load it from code:
-
-.. code-block::
-
-    >>> tokenizer = AutoTokenizer.from_pretrained("namespace/awesome-name-you-picked")
-    >>> model = AutoModel.from_pretrained("namespace/awesome-name-you-picked")
-
-
-You may specify a revision by using the ``revision`` flag in the ``from_pretrained`` method:
-
-.. code-block::
-
-    >>> tokenizer = AutoTokenizer.from_pretrained(
-    >>>   "julien-c/EsperBERTo-small",
-    >>>   revision="v2.0.1" # tag name, or branch name, or commit hash
-    >>> )
-
-Workflow in a Colab notebook
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If you're in a Colab notebook (or similar) with no direct access to a terminal, here is the workflow you can use to
-upload your model. You can execute each one of them in a cell by adding a ! at the beginning.
-
-First you need to install `git-lfs` in the environment used by the notebook:
-
-.. code-block:: bash
-
-    sudo apt-get install git-lfs
-
-Then you can use either create a repo directly from `huggingface.co <https://huggingface.co/>`__ , or use the
-:obj:`transformers-cli` to create it:
-
-
-.. code-block:: bash
-
-    transformers-cli login
-    transformers-cli repo create your-model-name
-
-Once it's created, you can clone it and configure it (replace username by your username on huggingface.co):
-
-.. code-block:: bash
-
-    git lfs install
-
-    git clone https://username:password@huggingface.co/username/your-model-name
-    # Alternatively if you have a token,
-    # you can use it instead of your password
-    git clone https://username:token@huggingface.co/username/your-model-name
-
-    cd your-model-name
-    git config --global user.email "email@example.com"
-    # Tip: using the same email than for your huggingface.co account will link your commits to your profile
-    git config --global user.name "Your name"
-
-Once you've saved your model inside, and your clone is setup with the right remote URL, you can add it and push it with
-usual git commands.
-
-.. code-block:: bash
-
-    git add .
-    git commit -m "Initial commit"
-    git push
diff --git a/docs/source/model_summary.mdx b/docs/source/model_summary.mdx
new file mode 100644
index 0000000000..27522dfd8b
--- /dev/null
+++ b/docs/source/model_summary.mdx
@@ -0,0 +1,793 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Summary of the models
+
+This is a summary of the models available in 🤗 Transformers. It assumes you’re familiar with the original [transformer
+model](https://arxiv.org/abs/1706.03762). For a gentle introduction check the [annotated transformer](http://nlp.seas.harvard.edu/2018/04/03/attention.html). Here we focus on the high-level differences between the
+models. You can check them more in detail in their respective documentation. Also check out [the Model Hub](https://huggingface.co/models) where you can filter the checkpoints by model architecture.
+
+Each one of the models in the library falls into one of the following categories:
+
+- [autoregressive-models](#autoregressive-models)
+- [autoencoding-models](#autoencoding-models)
+- [seq-to-seq-models](#seq-to-seq-models)
+- [multimodal-models](#multimodal-models)
+- [retrieval-based-models](#retrieval-based-models)
+
+<iframe width="560" height="315" src="https://www.youtube.com/embed/H39Z_720T5s" title="YouTube video player"
+frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
+picture-in-picture" allowfullscreen></iframe>
+
+Autoregressive models are pretrained on the classic language modeling task: guess the next token having read all the
+previous ones. They correspond to the decoder of the original transformer model, and a mask is used on top of the full
+sentence so that the attention heads can only see what was before in the text, and not what’s after. Although those
+models can be fine-tuned and achieve great results on many tasks, the most natural application is text generation. A
+typical example of such models is GPT.
+
+Autoencoding models are pretrained by corrupting the input tokens in some way and trying to reconstruct the original
+sentence. They correspond to the encoder of the original transformer model in the sense that they get access to the
+full inputs without any mask. Those models usually build a bidirectional representation of the whole sentence. They can
+be fine-tuned and achieve great results on many tasks such as text generation, but their most natural application is
+sentence classification or token classification. A typical example of such models is BERT.
+
+Note that the only difference between autoregressive models and autoencoding models is in the way the model is
+pretrained. Therefore, the same architecture can be used for both autoregressive and autoencoding models. When a given
+model has been used for both types of pretraining, we have put it in the category corresponding to the article where it
+was first introduced.
+
+Sequence-to-sequence models use both the encoder and the decoder of the original transformer, either for translation
+tasks or by transforming other tasks to sequence-to-sequence problems. They can be fine-tuned to many tasks but their
+most natural applications are translation, summarization and question answering. The original transformer model is an
+example of such a model (only for translation), T5 is an example that can be fine-tuned on other tasks.
+
+Multimodal models mix text inputs with other kinds (e.g. images) and are more specific to a given task.
+
+<a id='autoregressive-models'></a>
+
+## Decoders or autoregressive models
+
+As mentioned before, these models rely on the decoder part of the original transformer and use an attention mask so
+that at each position, the model can only look at the tokens before the attention heads.
+
+<Youtube id="d_ixlCubqQw"/>
+
+### Original GPT
+
+<a href="https://huggingface.co/models?filter=openai-gpt">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-openai--gpt-blueviolet">
+</a>
+<a href="model_doc/gpt">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-openai--gpt-blueviolet">
+</a>
+
+[Improving Language Understanding by Generative Pre-Training](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf), Alec Radford et al.
+
+The first autoregressive model based on the transformer architecture, pretrained on the Book Corpus dataset.
+
+The library provides versions of the model for language modeling and multitask language modeling/multiple choice
+classification.
+
+### GPT-2
+
+<a href="https://huggingface.co/models?filter=gpt2">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-gpt2-blueviolet">
+</a>
+<a href="model_doc/gpt2">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-gpt2-blueviolet">
+</a>
+
+[Language Models are Unsupervised Multitask Learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf),
+Alec Radford et al.
+
+A bigger and better version of GPT, pretrained on WebText (web pages from outgoing links in Reddit with 3 karmas or
+more).
+
+The library provides versions of the model for language modeling and multitask language modeling/multiple choice
+classification.
+
+### CTRL
+
+<a href="https://huggingface.co/models?filter=ctrl">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-ctrl-blueviolet">
+</a>
+<a href="model_doc/ctrl">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-ctrl-blueviolet">
+</a>
+
+[CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858),
+Nitish Shirish Keskar et al.
+
+Same as the GPT model but adds the idea of control codes. Text is generated from a prompt (can be empty) and one (or
+several) of those control codes which are then used to influence the text generation: generate with the style of
+wikipedia article, a book or a movie review.
+
+The library provides a version of the model for language modeling only.
+
+### Transformer-XL
+
+<a href="https://huggingface.co/models?filter=transfo-xl">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-transfo--xl-blueviolet">
+</a>
+<a href="model_doc/transformerxl">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-transfo--xl-blueviolet">
+</a>
+
+[Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860), Zihang
+Dai et al.
+
+Same as a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to a regular
+RNNs with two consecutive inputs). In this context, a segment is a number of consecutive tokens (for instance 512) that
+may span across multiple documents, and segments are fed in order to the model.
+
+Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention
+scores. This allows the model to pay attention to information that was in the previous segment as well as the current
+one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments.
+
+This changes the positional embeddings to positional relative embeddings (as the regular positional embeddings would
+give the same results in the current input and the current hidden state at a given position) and needs to make some
+adjustments in the way attention scores are computed.
+
+The library provides a version of the model for language modeling only.
+
+<a id='reformer'></a>
+
+### Reformer
+
+<a href="https://huggingface.co/models?filter=reformer">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-reformer-blueviolet">
+</a>
+<a href="model_doc/reformer">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-reformer-blueviolet">
+</a>
+
+[Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451), Nikita Kitaev et al .
+
+An autoregressive transformer model with lots of tricks to reduce memory footprint and compute time. Those tricks
+include:
+
+- Use [Axial position encoding](#axial-pos-encoding) (see below for more details). It’s a mechanism to avoid
+  having a huge positional encoding matrix (when the sequence length is very big) by factorizing it into smaller
+  matrices.
+- Replace traditional attention by [LSH (local-sensitive hashing) attention](#lsh-attention) (see below for more
+  details). It's a technique to avoid computing the full product query-key in the attention layers.
+- Avoid storing the intermediate results of each layer by using reversible transformer layers to obtain them during
+  the backward pass (subtracting the residuals from the input of the next layer gives them back) or recomputing them
+  for results inside a given layer (less efficient than storing them but saves memory).
+- Compute the feedforward operations by chunks and not on the whole batch.
+
+With those tricks, the model can be fed much larger sentences than traditional transformer autoregressive models.
+
+<Tip>
+
+This model could be very well be used in an autoencoding setting, there is no checkpoint for such a
+pretraining yet, though.
+
+</Tip>
+
+The library provides a version of the model for language modeling only.
+
+### XLNet
+
+<a href="https://huggingface.co/models?filter=xlnet">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-xlnet-blueviolet">
+</a>
+<a href="model_doc/xlnet">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlnet-blueviolet">
+</a>
+
+[XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237), Zhilin
+Yang et al.
+
+XLNet is not a traditional autoregressive model but uses a training strategy that builds on that. It permutes the
+tokens in the sentence, then allows the model to use the last n tokens to predict the token n+1. Since this is all done
+with a mask, the sentence is actually fed in the model in the right order, but instead of masking the first n tokens
+for n+1, XLNet uses a mask that hides the previous tokens in some given permutation of 1,...,sequence length.
+
+XLNet also uses the same recurrence mechanism as Transformer-XL to build long-term dependencies.
+
+The library provides a version of the model for language modeling, token classification, sentence classification,
+multiple choice classification and question answering.
+
+<a id='autoencoding-models'></a>
+
+## Encoders or autoencoding models
+
+As mentioned before, these models rely on the encoder part of the original transformer and use no mask so the model can
+look at all the tokens in the attention heads. For pretraining, targets are the original sentences and inputs are their
+corrupted versions.
+
+<Youtube id="MUqNwgPjJvQ"/>
+
+### BERT
+
+<a href="https://huggingface.co/models?filter=bert">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-bert-blueviolet">
+</a>
+<a href="model_doc/bert">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-bert-blueviolet">
+</a>
+
+[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805),
+Jacob Devlin et al.
+
+Corrupts the inputs by using random masking, more precisely, during pretraining, a given percentage of tokens (usually
+15%) is masked by:
+
+- a special mask token with probability 0.8
+- a random token different from the one masked with probability 0.1
+- the same token with probability 0.1
+
+The model must predict the original sentence, but has a second objective: inputs are two sentences A and B (with a
+separation token in between). With probability 50%, the sentences are consecutive in the corpus, in the remaining 50%
+they are not related. The model has to predict if the sentences are consecutive or not.
+
+The library provides a version of the model for language modeling (traditional or masked), next sentence prediction,
+token classification, sentence classification, multiple choice classification and question answering.
+
+### ALBERT
+
+<a href="https://huggingface.co/models?filter=albert">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-albert-blueviolet">
+</a>
+<a href="model_doc/albert">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-albert-blueviolet">
+</a>
+
+[ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942),
+Zhenzhong Lan et al.
+
+Same as BERT but with a few tweaks:
+
+- Embedding size E is different from hidden size H justified because the embeddings are context independent (one
+  embedding vector represents one token), whereas hidden states are context dependent (one hidden state represents a
+  sequence of tokens) so it's more logical to have H >> E. Also, the embedding matrix is large since it's V x E (V
+  being the vocab size). If E < H, it has less parameters.
+- Layers are split in groups that share parameters (to save memory).
+- Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and
+  B (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have
+  been swapped or not.
+
+The library provides a version of the model for masked language modeling, token classification, sentence
+classification, multiple choice classification and question answering.
+
+### RoBERTa
+
+<a href="https://huggingface.co/models?filter=roberta">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-roberta-blueviolet">
+</a>
+<a href="model_doc/roberta">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-roberta-blueviolet">
+</a>
+
+[RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692), Yinhan Liu et al.
+
+Same as BERT with better pretraining tricks:
+
+- dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all
+- no NSP (next sentence prediction) loss and instead of putting just two sentences together, put a chunk of
+  contiguous texts together to reach 512 tokens (so the sentences are in an order than may span several documents)
+- train with larger batches
+- use BPE with bytes as a subunit and not characters (because of unicode characters)
+
+The library provides a version of the model for masked language modeling, token classification, sentence
+classification, multiple choice classification and question answering.
+
+### DistilBERT
+
+<a href="https://huggingface.co/models?filter=distilbert">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-distilbert-blueviolet">
+</a>
+<a href="model_doc/distilbert">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-distilbert-blueviolet">
+</a>
+
+[DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108),
+Victor Sanh et al.
+
+Same as BERT but smaller. Trained by distillation of the pretrained BERT model, meaning it's been trained to predict
+the same probabilities as the larger model. The actual objective is a combination of:
+
+- finding the same probabilities as the teacher model
+- predicting the masked tokens correctly (but no next-sentence objective)
+- a cosine similarity between the hidden states of the student and the teacher model
+
+The library provides a version of the model for masked language modeling, token classification, sentence classification
+and question answering.
+
+### ConvBERT
+
+<a href="https://huggingface.co/models?filter=convbert">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-convbert-blueviolet">
+</a>
+<a href="model_doc/convbert">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-convbert-blueviolet">
+</a>
+
+[ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496), Zihang Jiang,
+Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
+
+Pre-trained language models like BERT and its variants have recently achieved impressive performance in various natural
+language understanding tasks. However, BERT heavily relies on the global self-attention block and thus suffers large
+memory footprint and computation cost. Although all its attention heads query on the whole input sequence for
+generating the attention map from a global perspective, we observe some heads only need to learn local dependencies,
+which means the existence of computation redundancy. We therefore propose a novel span-based dynamic convolution to
+replace these self-attention heads to directly model local dependencies. The novel convolution heads, together with the
+rest self-attention heads, form a new mixed attention block that is more efficient at both global and local context
+learning. We equip BERT with this mixed attention design and build a ConvBERT model. Experiments have shown that
+ConvBERT significantly outperforms BERT and its variants in various downstream tasks, with lower training cost and
+fewer model parameters. Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while
+using less than 1/4 training cost.
+
+The library provides a version of the model for masked language modeling, token classification, sentence classification
+and question answering.
+
+### XLM
+
+<a href="https://huggingface.co/models?filter=xlm">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-xlm-blueviolet">
+</a>
+<a href="model_doc/xlm">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlm-blueviolet">
+</a>
+
+[Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291), Guillaume Lample and Alexis Conneau
+
+A transformer model trained on several languages. There are three different type of training for this model and the
+library provides checkpoints for all of them:
+
+- Causal language modeling (CLM) which is the traditional autoregressive training (so this model could be in the
+  previous section as well). One of the languages is selected for each training sample, and the model input is a
+  sentence of 256 tokens, that may span over several documents in one of those languages.
+- Masked language modeling (MLM) which is like RoBERTa. One of the languages is selected for each training sample,
+  and the model input is a sentence of 256 tokens, that may span over several documents in one of those languages,
+  with dynamic masking of the tokens.
+- A combination of MLM and translation language modeling (TLM). This consists of concatenating a sentence in two
+  different languages, with random masking. To predict one of the masked tokens, the model can use both, the
+  surrounding context in language 1 and the context given by language 2.
+
+Checkpoints refer to which method was used for pretraining by having *clm*, *mlm* or *mlm-tlm* in their names. On top
+of positional embeddings, the model has language embeddings. When training using MLM/CLM, this gives the model an
+indication of the language used, and when training using MLM+TLM, an indication of the language used for each part.
+
+The library provides a version of the model for language modeling, token classification, sentence classification and
+question answering.
+
+### XLM-RoBERTa
+
+<a href="https://huggingface.co/models?filter=xlm-roberta">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-xlm--roberta-blueviolet">
+</a>
+<a href="model_doc/xlmroberta">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlm--roberta-blueviolet">
+</a>
+
+[Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116), Alexis Conneau et
+al.
+
+Uses RoBERTa tricks on the XLM approach, but does not use the translation language modeling objective. It only uses
+masked language modeling on sentences coming from one language. However, the model is trained on many more languages
+(100) and doesn't use the language embeddings, so it's capable of detecting the input language by itself.
+
+The library provides a version of the model for masked language modeling, token classification, sentence
+classification, multiple choice classification and question answering.
+
+### FlauBERT
+
+<a href="https://huggingface.co/models?filter=flaubert">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-flaubert-blueviolet">
+</a>
+<a href="model_doc/flaubert">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-flaubert-blueviolet">
+</a>
+
+[FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372), Hang Le et al.
+
+Like RoBERTa, without the sentence ordering prediction (so just trained on the MLM objective).
+
+The library provides a version of the model for language modeling and sentence classification.
+
+### ELECTRA
+
+<a href="https://huggingface.co/models?filter=electra">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-electra-blueviolet">
+</a>
+<a href="model_doc/electra">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-electra-blueviolet">
+</a>
+
+[ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://arxiv.org/abs/2003.10555),
+Kevin Clark et al.
+
+ELECTRA is a transformer model pretrained with the use of another (small) masked language model. The inputs are
+corrupted by that language model, which takes an input text that is randomly masked and outputs a text in which ELECTRA
+has to predict which token is an original and which one has been replaced. Like for GAN training, the small language
+model is trained for a few steps (but with the original texts as objective, not to fool the ELECTRA model like in a
+traditional GAN setting) then the ELECTRA model is trained for a few steps.
+
+The library provides a version of the model for masked language modeling, token classification and sentence
+classification.
+
+### Funnel Transformer
+
+<a href="https://huggingface.co/models?filter=funnel">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-funnel-blueviolet">
+</a>
+<a href="model_doc/funnel">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-funnel-blueviolet">
+</a>
+
+[Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236), Zihang Dai et al.
+
+Funnel Transformer is a transformer model using pooling, a bit like a ResNet model: layers are grouped in blocks, and
+at the beginning of each block (except the first one), the hidden states are pooled among the sequence dimension. This
+way, their length is divided by 2, which speeds up the computation of the next hidden states. All pretrained models
+have three blocks, which means the final hidden state has a sequence length that is one fourth of the original sequence
+length.
+
+For tasks such as classification, this is not a problem, but for tasks like masked language modeling or token
+classification, we need a hidden state with the same sequence length as the original input. In those cases, the final
+hidden states are upsampled to the input sequence length and go through two additional layers. That's why there are two
+versions of each checkpoint. The version suffixed with "-base" contains only the three blocks, while the version
+without that suffix contains the three blocks and the upsampling head with its additional layers.
+
+The pretrained models available use the same pretraining objective as ELECTRA.
+
+The library provides a version of the model for masked language modeling, token classification, sentence
+classification, multiple choice classification and question answering.
+
+<a id='longformer'></a>
+
+### Longformer
+
+<a href="https://huggingface.co/models?filter=longformer">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-longformer-blueviolet">
+</a>
+<a href="model_doc/longformer">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-longformer-blueviolet">
+</a>
+
+[Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150), Iz Beltagy et al.
+
+A transformer model replacing the attention matrices by sparse matrices to go faster. Often, the local context (e.g.,
+what are the two tokens left and right?) is enough to take action for a given token. Some preselected input tokens are
+still given global attention, but the attention matrix has way less parameters, resulting in a speed-up. See the
+[local attention section](#local-attention) for more information.
+
+It is pretrained the same way a RoBERTa otherwise.
+
+<Tip>
+
+This model could be very well be used in an autoregressive setting, there is no checkpoint for such a
+pretraining yet, though.
+
+</Tip>
+
+The library provides a version of the model for masked language modeling, token classification, sentence
+classification, multiple choice classification and question answering.
+
+<a id='seq-to-seq-models'></a>
+
+## Sequence-to-sequence models
+
+As mentioned before, these models keep both the encoder and the decoder of the original transformer.
+
+<Youtube id="0_4KEb08xrE"/>
+
+### BART
+
+<a href="https://huggingface.co/models?filter=bart">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-bart-blueviolet">
+</a>
+<a href="model_doc/bart">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-bart-blueviolet">
+</a>
+
+[BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461), Mike Lewis et al.
+
+Sequence-to-sequence model with an encoder and a decoder. Encoder is fed a corrupted version of the tokens, decoder is
+fed the original tokens (but has a mask to hide the future words like a regular transformers decoder). A composition of
+the following transformations are applied on the pretraining tasks for the encoder:
+
+- mask random tokens (like in BERT)
+- delete random tokens
+- mask a span of k tokens with a single mask token (a span of 0 tokens is an insertion of a mask token)
+- permute sentences
+- rotate the document to make it start at a specific token
+
+The library provides a version of this model for conditional generation and sequence classification.
+
+### Pegasus
+
+<a href="https://huggingface.co/models?filter=pegasus">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-pegasus-blueviolet">
+</a>
+<a href="model_doc/pegasus">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-pegasus-blueviolet">
+</a>
+
+[PEGASUS: Pre-training with Extracted Gap-sentences forAbstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf), Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.
+
+Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on
+two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pretraining
+objective, called Gap Sentence Generation (GSG).
+
+- MLM: encoder input tokens are randomly replaced by a mask tokens and have to be predicted by the encoder (like in
+  BERT)
+- GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, but which has a
+  causal mask to hide the future words like a regular auto-regressive transformer decoder.
+
+In contrast to BART, Pegasus' pretraining task is intentionally similar to summarization: important sentences are
+masked and are generated together as one output sequence from the remaining sentences, similar to an extractive
+summary.
+
+The library provides a version of this model for conditional generation, which should be used for summarization.
+
+
+### MarianMT
+
+<a href="https://huggingface.co/models?filter=marian">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-marian-blueviolet">
+</a>
+<a href="model_doc/marian">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-marian-blueviolet">
+</a>
+
+[Marian: Fast Neural Machine Translation in C++](https://arxiv.org/abs/1804.00344), Marcin Junczys-Dowmunt et al.
+
+A framework for translation models, using the same models as BART
+
+The library provides a version of this model for conditional generation.
+
+
+### T5
+
+<a href="https://huggingface.co/models?filter=t5">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-t5-blueviolet">
+</a>
+<a href="model_doc/t5">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-t5-blueviolet">
+</a>
+
+[Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683), Colin Raffel et al.
+
+Uses the traditional transformer model (with a slight change in the positional embeddings, which are learned at each
+layer). To be able to operate on all NLP tasks, it transforms them into text-to-text problems by using specific
+prefixes: “summarize: ”, “question: ”, “translate English to German: ” and so forth.
+
+The pretraining includes both supervised and self-supervised training. Supervised training is conducted on downstream
+tasks provided by the GLUE and SuperGLUE benchmarks (converting them into text-to-text tasks as explained above).
+
+Self-supervised training uses corrupted tokens, by randomly removing 15% of the tokens and replacing them with
+individual sentinel tokens (if several consecutive tokens are marked for removal, the whole group is replaced with a
+single sentinel token). The input of the encoder is the corrupted sentence, the input of the decoder is the original
+sentence and the target is then the dropped out tokens delimited by their sentinel tokens.
+
+For instance, if we have the sentence “My dog is very cute .”, and we decide to remove the tokens: "dog", "is" and
+"cute", the encoder input becomes “My <x> very <y> .” and the target input becomes “<x> dog is <y> cute .<z>”
+
+The library provides a version of this model for conditional generation.
+
+
+### MT5
+
+<a href="https://huggingface.co/models?filter=mt5">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-mt5-blueviolet">
+</a>
+<a href="model_doc/mt5">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-mt5-blueviolet">
+</a>
+
+[mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934), Linting Xue
+et al.
+
+The model architecture is same as T5. mT5's pretraining objective includes T5's self-supervised training, but not T5's
+supervised training. mT5 is trained on 101 languages.
+
+The library provides a version of this model for conditional generation.
+
+
+### MBart
+
+<a href="https://huggingface.co/models?filter=mbart">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-mbart-blueviolet">
+</a>
+<a href="model_doc/mbart">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-mbart-blueviolet">
+</a>
+
+[Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu,
+Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+
+The model architecture and pretraining objective is same as BART, but MBart is trained on 25 languages and is intended
+for supervised and unsupervised machine translation. MBart is one of the first methods for pretraining a complete
+sequence-to-sequence model by denoising full texts in multiple languages,
+
+The library provides a version of this model for conditional generation.
+
+The [mbart-large-en-ro checkpoint](https://huggingface.co/facebook/mbart-large-en-ro) can be used for english ->
+romanian translation.
+
+The [mbart-large-cc25](https://huggingface.co/facebook/mbart-large-cc25) checkpoint can be finetuned for other
+translation and summarization tasks, using code in ```examples/pytorch/translation/``` , but is not very useful without
+finetuning.
+
+
+### ProphetNet
+
+<a href="https://huggingface.co/models?filter=prophetnet">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-prophetnet-blueviolet">
+</a>
+<a href="model_doc/prophetnet">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-prophetnet-blueviolet">
+</a>
+
+[ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by
+Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.
+
+ProphetNet introduces a novel *sequence-to-sequence* pretraining objective, called *future n-gram prediction*. In
+future n-gram prediction, the model predicts the next n tokens simultaneously based on previous context tokens at each
+time step instead instead of just the single next token. The future n-gram prediction explicitly encourages the model
+to plan for the future tokens and prevent overfitting on strong local correlations. The model architecture is based on
+the original Transformer, but replaces the "standard" self-attention mechanism in the decoder by a a main
+self-attention mechanism and a self and n-stream (predict) self-attention mechanism.
+
+The library provides a pre-trained version of this model for conditional generation and a fine-tuned version for
+summarization.
+
+### XLM-ProphetNet
+
+<a href="https://huggingface.co/models?filter=xprophetnet">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-xprophetnet-blueviolet">
+</a>
+<a href="model_doc/xlmprophetnet">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xprophetnet-blueviolet">
+</a>
+
+[ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by
+Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.
+
+XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained
+on the cross-lingual dataset [XGLUE](https://arxiv.org/abs/2004.01401).
+
+The library provides a pre-trained version of this model for multi-lingual conditional generation and fine-tuned
+versions for headline generation and question generation, respectively.
+
+<a id='multimodal-models'></a>
+
+## Multimodal models
+
+There is one multimodal model in the library which has not been pretrained in the self-supervised fashion like the
+others.
+
+### MMBT
+
+[Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/abs/1909.02950), Douwe Kiela
+et al.
+
+A transformers model used in multimodal settings, combining a text and an image to make predictions. The transformer
+model takes as inputs the embeddings of the tokenized text and the final activations of a pretrained on images resnet
+(after the pooling layer) that goes through a linear layer (to go from number of features at the end of the resnet to
+the hidden state dimension of the transformer).
+
+The different inputs are concatenated, and on top of the positional embeddings, a segment embedding is added to let the
+model know which part of the input vector corresponds to the text and which to the image.
+
+The pretrained model only works for classification.
+
+<!--More information in this [model documentation](model_doc/mmbt). TODO: write this page
+-->
+
+<a id='retrieval-based-models'></a>
+
+## Retrieval-based models
+
+Some models use documents retrieval during (pre)training and inference for open-domain question answering, for example.
+
+
+### DPR
+
+<a href="https://huggingface.co/models?filter=dpr">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-dpr-blueviolet">
+</a>
+<a href="model_doc/dpr">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-dpr-blueviolet">
+</a>
+
+[Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906), Vladimir Karpukhin et
+al.
+
+Dense Passage Retrieval (DPR) - is a set of tools and models for state-of-the-art open-domain question-answering
+research.
+
+
+DPR consists in three models:
+
+- Question encoder: encode questions as vectors
+- Context encoder: encode contexts as vectors
+- Reader: extract the answer of the questions inside retrieved contexts, along with a relevance score (high if the
+  inferred span actually answers the question).
+
+DPR's pipeline (not implemented yet) uses a retrieval step to find the top k contexts given a certain question, and
+then it calls the reader with the question and the retrieved documents to get the answer.
+
+### RAG
+
+<a href="https://huggingface.co/models?filter=rag">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-rag-blueviolet">
+</a>
+<a href="model_doc/rag">
+<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-rag-blueviolet">
+</a>
+
+[Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401), Patrick Lewis,
+Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau
+Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela
+
+Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and Seq2Seq
+models. RAG models retrieve docs, pass them to a seq2seq model, then marginalize to generate outputs. The retriever and
+seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing both retrieval and generation
+to adapt to downstream tasks.
+
+The two models RAG-Token and RAG-Sequence are available for generation.
+
+## More technical aspects
+
+### Full vs sparse attention
+
+Most transformer models use full attention in the sense that the attention matrix is square. It can be a big
+computational bottleneck when you have long texts. Longformer and reformer are models that try to be more efficient and
+use a sparse version of the attention matrix to speed up training.
+
+<a id='lsh-attention'></a>
+
+**LSH attention**
+
+[Reformer](#reformer) uses LSH attention. In the softmax(QK^t), only the biggest elements (in the softmax
+dimension) of the matrix QK^t are going to give useful contributions. So for each query q in Q, we can consider only
+the keys k in K that are close to q. A hash function is used to determine if q and k are close. The attention mask is
+modified to mask the current token (except at the first position), because it will give a query and a key equal (so
+very similar to each other). Since the hash can be a bit random, several hash functions are used in practice
+(determined by a n_rounds parameter) and then are averaged together.
+
+<a id='local-attention'></a>
+
+**Local attention**
+
+[Longformer](#longformer) uses local attention: often, the local context (e.g., what are the two tokens to the
+left and right?) is enough to take action for a given token. Also, by stacking attention layers that have a small
+window, the last layer will have a receptive field of more than just the tokens in the window, allowing them to build a
+representation of the whole sentence.
+
+Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access
+all tokens and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in
+their local window). This is shown in Figure 2d of the paper, see below for a sample attention mask:
+
+<img scale="50 %" align="center" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/local_attention_mask.png"/>
+
+Using those attention matrices with less parameters then allows the model to have inputs having a bigger sequence
+length.
+
+### Other tricks
+
+<a id='axial-pos-encoding'></a>
+
+**Axial positional encodings**
+
+[Reformer](#reformer) uses axial positional encodings: in traditional transformer models, the positional encoding
+E is a matrix of size \\(l\\) by \\(d\\), \\(l\\) being the sequence length and \\(d\\) the dimension of the
+hidden state. If you have very long texts, this matrix can be huge and take way too much space on the GPU. To alleviate
+that, axial positional encodings consist of factorizing that big matrix E in two smaller matrices E1 and E2, with
+dimensions \\(l_{1} \times d_{1}\\) and \\(l_{2} \times d_{2}\\), such that \\(l_{1} \times l_{2} = l\\) and
+\\(d_{1} + d_{2} = d\\) (with the product for the lengths, this ends up being way smaller). The embedding for time
+step \\(j\\) in E is obtained by concatenating the embeddings for timestep \\(j \% l1\\) in E1 and \\(j // l1\\)
+in E2.
diff --git a/docs/source/model_summary.rst b/docs/source/model_summary.rst
deleted file mode 100644
index 0bd42bb819..0000000000
--- a/docs/source/model_summary.rst
+++ /dev/null
@@ -1,901 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Summary of the models
-=======================================================================================================================
-
-This is a summary of the models available in 🤗 Transformers. It assumes you’re familiar with the original `transformer
-model <https://arxiv.org/abs/1706.03762>`_. For a gentle introduction check the `annotated transformer
-<http://nlp.seas.harvard.edu/2018/04/03/attention.html>`_. Here we focus on the high-level differences between the
-models. You can check them more in detail in their respective documentation. Also check out the :doc:`pretrained model
-page </pretrained_models>` to see the checkpoints available for each type of model and all `the community models
-<https://huggingface.co/models>`_.
-
-Each one of the models in the library falls into one of the following categories:
-
-  * :ref:`autoregressive-models`
-  * :ref:`autoencoding-models`
-  * :ref:`seq-to-seq-models`
-  * :ref:`multimodal-models`
-  * :ref:`retrieval-based-models`
-
-.. raw:: html
-
-   <iframe width="560" height="315" src="https://www.youtube.com/embed/H39Z_720T5s" title="YouTube video player"
-   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
-   picture-in-picture" allowfullscreen></iframe>
-
-Autoregressive models are pretrained on the classic language modeling task: guess the next token having read all the
-previous ones. They correspond to the decoder of the original transformer model, and a mask is used on top of the full
-sentence so that the attention heads can only see what was before in the text, and not what’s after. Although those
-models can be fine-tuned and achieve great results on many tasks, the most natural application is text generation. A
-typical example of such models is GPT.
-
-Autoencoding models are pretrained by corrupting the input tokens in some way and trying to reconstruct the original
-sentence. They correspond to the encoder of the original transformer model in the sense that they get access to the
-full inputs without any mask. Those models usually build a bidirectional representation of the whole sentence. They can
-be fine-tuned and achieve great results on many tasks such as text generation, but their most natural application is
-sentence classification or token classification. A typical example of such models is BERT.
-
-Note that the only difference between autoregressive models and autoencoding models is in the way the model is
-pretrained. Therefore, the same architecture can be used for both autoregressive and autoencoding models. When a given
-model has been used for both types of pretraining, we have put it in the category corresponding to the article where it
-was first introduced.
-
-Sequence-to-sequence models use both the encoder and the decoder of the original transformer, either for translation
-tasks or by transforming other tasks to sequence-to-sequence problems. They can be fine-tuned to many tasks but their
-most natural applications are translation, summarization and question answering. The original transformer model is an
-example of such a model (only for translation), T5 is an example that can be fine-tuned on other tasks.
-
-Multimodal models mix text inputs with other kinds (e.g. images) and are more specific to a given task.
-
-.. _autoregressive-models:
-
-Decoders or autoregressive models
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-As mentioned before, these models rely on the decoder part of the original transformer and use an attention mask so
-that at each position, the model can only look at the tokens before the attention heads.
-
-.. raw:: html
-
-   <iframe width="560" height="315" src="https://www.youtube.com/embed/d_ixlCubqQw" title="YouTube video player"
-   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
-   picture-in-picture" allowfullscreen></iframe>
-
-Original GPT
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=openai-gpt">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-openai--gpt-blueviolet">
-   </a>
-   <a href="model_doc/gpt">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-openai--gpt-blueviolet">
-   </a>
-
-`Improving Language Understanding by Generative Pre-Training
-<https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf>`_, Alec Radford et al.
-
-The first autoregressive model based on the transformer architecture, pretrained on the Book Corpus dataset.
-
-The library provides versions of the model for language modeling and multitask language modeling/multiple choice
-classification.
-
-GPT-2
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=gpt2">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-gpt2-blueviolet">
-   </a>
-   <a href="model_doc/gpt2">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-gpt2-blueviolet">
-   </a>
-
-`Language Models are Unsupervised Multitask Learners
-<https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_,
-Alec Radford et al.
-
-A bigger and better version of GPT, pretrained on WebText (web pages from outgoing links in Reddit with 3 karmas or
-more).
-
-The library provides versions of the model for language modeling and multitask language modeling/multiple choice
-classification.
-
-CTRL
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=ctrl">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-ctrl-blueviolet">
-   </a>
-   <a href="model_doc/ctrl">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-ctrl-blueviolet">
-   </a>
-
-`CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_,
-Nitish Shirish Keskar et al.
-
-Same as the GPT model but adds the idea of control codes. Text is generated from a prompt (can be empty) and one (or
-several) of those control codes which are then used to influence the text generation: generate with the style of
-wikipedia article, a book or a movie review.
-
-The library provides a version of the model for language modeling only.
-
-Transformer-XL
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=transfo-xl">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-transfo--xl-blueviolet">
-   </a>
-   <a href="model_doc/transformerxl">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-transfo--xl-blueviolet">
-   </a>
-
-`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_, Zihang
-Dai et al.
-
-Same as a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to a regular
-RNNs with two consecutive inputs). In this context, a segment is a number of consecutive tokens (for instance 512) that
-may span across multiple documents, and segments are fed in order to the model.
-
-Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention
-scores. This allows the model to pay attention to information that was in the previous segment as well as the current
-one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments.
-
-This changes the positional embeddings to positional relative embeddings (as the regular positional embeddings would
-give the same results in the current input and the current hidden state at a given position) and needs to make some
-adjustments in the way attention scores are computed.
-
-The library provides a version of the model for language modeling only.
-
-.. _reformer:
-
-Reformer
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=reformer">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-reformer-blueviolet">
-   </a>
-   <a href="model_doc/reformer">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-reformer-blueviolet">
-   </a>
-
-`Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_, Nikita Kitaev et al .
-
-An autoregressive transformer model with lots of tricks to reduce memory footprint and compute time. Those tricks
-include:
-
-  * Use :ref:`Axial position encoding <axial-pos-encoding>` (see below for more details). It’s a mechanism to avoid
-    having a huge positional encoding matrix (when the sequence length is very big) by factorizing it into smaller
-    matrices.
-  * Replace traditional attention by :ref:`LSH (local-sensitive hashing) attention <lsh-attention>` (see below for more
-    details). It's a technique to avoid computing the full product query-key in the attention layers.
-  * Avoid storing the intermediate results of each layer by using reversible transformer layers to obtain them during
-    the backward pass (subtracting the residuals from the input of the next layer gives them back) or recomputing them
-    for results inside a given layer (less efficient than storing them but saves memory).
-  * Compute the feedforward operations by chunks and not on the whole batch.
-
-With those tricks, the model can be fed much larger sentences than traditional transformer autoregressive models.
-
-**Note:** This model could be very well be used in an autoencoding setting, there is no checkpoint for such a
-pretraining yet, though.
-
-The library provides a version of the model for language modeling only.
-
-XLNet
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=xlnet">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-xlnet-blueviolet">
-   </a>
-   <a href="model_doc/xlnet">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlnet-blueviolet">
-   </a>
-
-`XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_, Zhilin
-Yang et al.
-
-XLNet is not a traditional autoregressive model but uses a training strategy that builds on that. It permutes the
-tokens in the sentence, then allows the model to use the last n tokens to predict the token n+1. Since this is all done
-with a mask, the sentence is actually fed in the model in the right order, but instead of masking the first n tokens
-for n+1, XLNet uses a mask that hides the previous tokens in some given permutation of 1,...,sequence length.
-
-XLNet also uses the same recurrence mechanism as Transformer-XL to build long-term dependencies.
-
-The library provides a version of the model for language modeling, token classification, sentence classification,
-multiple choice classification and question answering.
-
-.. _autoencoding-models:
-
-Encoders or autoencoding models
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-As mentioned before, these models rely on the encoder part of the original transformer and use no mask so the model can
-look at all the tokens in the attention heads. For pretraining, targets are the original sentences and inputs are their
-corrupted versions.
-
-.. raw:: html
-
-   <iframe width="560" height="315" src="https://www.youtube.com/embed/MUqNwgPjJvQ" title="YouTube video player"
-   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
-   picture-in-picture" allowfullscreen></iframe>
-
-BERT
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=bert">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-bert-blueviolet">
-   </a>
-   <a href="model_doc/bert">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-bert-blueviolet">
-   </a>
-
-`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_,
-Jacob Devlin et al.
-
-Corrupts the inputs by using random masking, more precisely, during pretraining, a given percentage of tokens (usually
-15%) is masked by:
-
-  * a special mask token with probability 0.8
-  * a random token different from the one masked with probability 0.1
-  * the same token with probability 0.1
-
-The model must predict the original sentence, but has a second objective: inputs are two sentences A and B (with a
-separation token in between). With probability 50%, the sentences are consecutive in the corpus, in the remaining 50%
-they are not related. The model has to predict if the sentences are consecutive or not.
-
-The library provides a version of the model for language modeling (traditional or masked), next sentence prediction,
-token classification, sentence classification, multiple choice classification and question answering.
-
-ALBERT
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=albert">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-albert-blueviolet">
-   </a>
-   <a href="model_doc/albert">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-albert-blueviolet">
-   </a>
-
-`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_,
-Zhenzhong Lan et al.
-
-Same as BERT but with a few tweaks:
-
-  * Embedding size E is different from hidden size H justified because the embeddings are context independent (one
-    embedding vector represents one token), whereas hidden states are context dependent (one hidden state represents a
-    sequence of tokens) so it's more logical to have H >> E. Also, the embedding matrix is large since it's V x E (V
-    being the vocab size). If E < H, it has less parameters.
-  * Layers are split in groups that share parameters (to save memory).
-  * Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and
-    B (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have
-    been swapped or not.
-
-The library provides a version of the model for masked language modeling, token classification, sentence
-classification, multiple choice classification and question answering.
-
-RoBERTa
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=roberta">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-roberta-blueviolet">
-   </a>
-   <a href="model_doc/roberta">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-roberta-blueviolet">
-   </a>
-
-`RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_, Yinhan Liu et al.
-
-Same as BERT with better pretraining tricks:
-
-  * dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all
-  * no NSP (next sentence prediction) loss and instead of putting just two sentences together, put a chunk of
-    contiguous texts together to reach 512 tokens (so the sentences are in an order than may span several documents)
-  * train with larger batches
-  * use BPE with bytes as a subunit and not characters (because of unicode characters)
-
-The library provides a version of the model for masked language modeling, token classification, sentence
-classification, multiple choice classification and question answering.
-
-DistilBERT
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=distilbert">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-distilbert-blueviolet">
-   </a>
-   <a href="model_doc/distilbert">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-distilbert-blueviolet">
-   </a>
-
-`DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_,
-Victor Sanh et al.
-
-Same as BERT but smaller. Trained by distillation of the pretrained BERT model, meaning it's been trained to predict
-the same probabilities as the larger model. The actual objective is a combination of:
-
-  * finding the same probabilities as the teacher model
-  * predicting the masked tokens correctly (but no next-sentence objective)
-  * a cosine similarity between the hidden states of the student and the teacher model
-
-The library provides a version of the model for masked language modeling, token classification, sentence classification
-and question answering.
-
-ConvBERT
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=convbert">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-convbert-blueviolet">
-   </a>
-   <a href="model_doc/convbert">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-convbert-blueviolet">
-   </a>
-
-`ConvBERT: Improving BERT with Span-based Dynamic Convolution <https://arxiv.org/abs/2008.02496>`_, Zihang Jiang,
-Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-
-Pre-trained language models like BERT and its variants have recently achieved impressive performance in various natural
-language understanding tasks. However, BERT heavily relies on the global self-attention block and thus suffers large
-memory footprint and computation cost. Although all its attention heads query on the whole input sequence for
-generating the attention map from a global perspective, we observe some heads only need to learn local dependencies,
-which means the existence of computation redundancy. We therefore propose a novel span-based dynamic convolution to
-replace these self-attention heads to directly model local dependencies. The novel convolution heads, together with the
-rest self-attention heads, form a new mixed attention block that is more efficient at both global and local context
-learning. We equip BERT with this mixed attention design and build a ConvBERT model. Experiments have shown that
-ConvBERT significantly outperforms BERT and its variants in various downstream tasks, with lower training cost and
-fewer model parameters. Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while
-using less than 1/4 training cost.
-
-The library provides a version of the model for masked language modeling, token classification, sentence classification
-and question answering.
-
-XLM
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=xlm">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-xlm-blueviolet">
-   </a>
-   <a href="model_doc/xlm">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlm-blueviolet">
-   </a>
-
-`Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_, Guillaume Lample and Alexis Conneau
-
-A transformer model trained on several languages. There are three different type of training for this model and the
-library provides checkpoints for all of them:
-
-  * Causal language modeling (CLM) which is the traditional autoregressive training (so this model could be in the
-    previous section as well). One of the languages is selected for each training sample, and the model input is a
-    sentence of 256 tokens, that may span over several documents in one of those languages.
-  * Masked language modeling (MLM) which is like RoBERTa. One of the languages is selected for each training sample,
-    and the model input is a sentence of 256 tokens, that may span over several documents in one of those languages,
-    with dynamic masking of the tokens.
-  * A combination of MLM and translation language modeling (TLM). This consists of concatenating a sentence in two
-    different languages, with random masking. To predict one of the masked tokens, the model can use both, the
-    surrounding context in language 1 and the context given by language 2.
-
-Checkpoints refer to which method was used for pretraining by having `clm`, `mlm` or `mlm-tlm` in their names. On top
-of positional embeddings, the model has language embeddings. When training using MLM/CLM, this gives the model an
-indication of the language used, and when training using MLM+TLM, an indication of the language used for each part.
-
-The library provides a version of the model for language modeling, token classification, sentence classification and
-question answering.
-
-XLM-RoBERTa
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=xlm-roberta">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-xlm--roberta-blueviolet">
-   </a>
-   <a href="model_doc/xlmroberta">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlm--roberta-blueviolet">
-   </a>
-
-`Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_, Alexis Conneau et
-al.
-
-Uses RoBERTa tricks on the XLM approach, but does not use the translation language modeling objective. It only uses
-masked language modeling on sentences coming from one language. However, the model is trained on many more languages
-(100) and doesn't use the language embeddings, so it's capable of detecting the input language by itself.
-
-The library provides a version of the model for masked language modeling, token classification, sentence
-classification, multiple choice classification and question answering.
-
-FlauBERT
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=flaubert">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-flaubert-blueviolet">
-   </a>
-   <a href="model_doc/flaubert">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-flaubert-blueviolet">
-   </a>
-
-`FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_, Hang Le et al.
-
-Like RoBERTa, without the sentence ordering prediction (so just trained on the MLM objective).
-
-The library provides a version of the model for language modeling and sentence classification.
-
-ELECTRA
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=electra">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-electra-blueviolet">
-   </a>
-   <a href="model_doc/electra">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-electra-blueviolet">
-   </a>
-
-`ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators <https://arxiv.org/abs/2003.10555>`_,
-Kevin Clark et al.
-
-ELECTRA is a transformer model pretrained with the use of another (small) masked language model. The inputs are
-corrupted by that language model, which takes an input text that is randomly masked and outputs a text in which ELECTRA
-has to predict which token is an original and which one has been replaced. Like for GAN training, the small language
-model is trained for a few steps (but with the original texts as objective, not to fool the ELECTRA model like in a
-traditional GAN setting) then the ELECTRA model is trained for a few steps.
-
-The library provides a version of the model for masked language modeling, token classification and sentence
-classification.
-
-Funnel Transformer
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=funnel">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-funnel-blueviolet">
-   </a>
-   <a href="model_doc/funnel">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-funnel-blueviolet">
-   </a>
-
-`Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing
-<https://arxiv.org/abs/2006.03236>`_, Zihang Dai et al.
-
-Funnel Transformer is a transformer model using pooling, a bit like a ResNet model: layers are grouped in blocks, and
-at the beginning of each block (except the first one), the hidden states are pooled among the sequence dimension. This
-way, their length is divided by 2, which speeds up the computation of the next hidden states. All pretrained models
-have three blocks, which means the final hidden state has a sequence length that is one fourth of the original sequence
-length.
-
-For tasks such as classification, this is not a problem, but for tasks like masked language modeling or token
-classification, we need a hidden state with the same sequence length as the original input. In those cases, the final
-hidden states are upsampled to the input sequence length and go through two additional layers. That's why there are two
-versions of each checkpoint. The version suffixed with "-base" contains only the three blocks, while the version
-without that suffix contains the three blocks and the upsampling head with its additional layers.
-
-The pretrained models available use the same pretraining objective as ELECTRA.
-
-The library provides a version of the model for masked language modeling, token classification, sentence
-classification, multiple choice classification and question answering.
-
-.. _longformer:
-
-Longformer
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=longformer">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-longformer-blueviolet">
-   </a>
-   <a href="model_doc/longformer">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-longformer-blueviolet">
-   </a>
-
-`Longformer: The Long-Document Transformer <https://arxiv.org/abs/2004.05150>`_, Iz Beltagy et al.
-
-A transformer model replacing the attention matrices by sparse matrices to go faster. Often, the local context (e.g.,
-what are the two tokens left and right?) is enough to take action for a given token. Some preselected input tokens are
-still given global attention, but the attention matrix has way less parameters, resulting in a speed-up. See the
-:ref:`local attention section <local-attention>` for more information.
-
-It is pretrained the same way a RoBERTa otherwise.
-
-**Note:** This model could be very well be used in an autoregressive setting, there is no checkpoint for such a
-pretraining yet, though.
-
-The library provides a version of the model for masked language modeling, token classification, sentence
-classification, multiple choice classification and question answering.
-
-.. _seq-to-seq-models:
-
-Sequence-to-sequence models
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-As mentioned before, these models keep both the encoder and the decoder of the original transformer.
-
-.. raw:: html
-
-   <iframe width="560" height="315" src="https://www.youtube.com/embed/0_4KEb08xrE" title="YouTube video player"
-   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
-   picture-in-picture" allowfullscreen></iframe>
-
-BART
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=bart">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-bart-blueviolet">
-   </a>
-   <a href="model_doc/bart">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-bart-blueviolet">
-   </a>
-
-`BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension
-<https://arxiv.org/abs/1910.13461>`_, Mike Lewis et al.
-
-Sequence-to-sequence model with an encoder and a decoder. Encoder is fed a corrupted version of the tokens, decoder is
-fed the original tokens (but has a mask to hide the future words like a regular transformers decoder). A composition of
-the following transformations are applied on the pretraining tasks for the encoder:
-
-  * mask random tokens (like in BERT)
-  * delete random tokens
-  * mask a span of k tokens with a single mask token (a span of 0 tokens is an insertion of a mask token)
-  * permute sentences
-  * rotate the document to make it start at a specific token
-
-The library provides a version of this model for conditional generation and sequence classification.
-
-Pegasus
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=pegasus">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-pegasus-blueviolet">
-   </a>
-   <a href="model_doc/pegasus">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-pegasus-blueviolet">
-   </a>
-
-`PEGASUS: Pre-training with Extracted Gap-sentences forAbstractive Summarization
-<https://arxiv.org/pdf/1912.08777.pdf>`_, Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.
-
-Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on
-two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pretraining
-objective, called Gap Sentence Generation (GSG).
-
-  * MLM: encoder input tokens are randomly replaced by a mask tokens and have to be predicted by the encoder (like in
-    BERT)
-  * GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, but which has a
-    causal mask to hide the future words like a regular auto-regressive transformer decoder.
-
-In contrast to BART, Pegasus' pretraining task is intentionally similar to summarization: important sentences are
-masked and are generated together as one output sequence from the remaining sentences, similar to an extractive
-summary.
-
-The library provides a version of this model for conditional generation, which should be used for summarization.
-
-
-MarianMT
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=marian">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-marian-blueviolet">
-   </a>
-   <a href="model_doc/marian">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-marian-blueviolet">
-   </a>
-
-`Marian: Fast Neural Machine Translation in C++ <https://arxiv.org/abs/1804.00344>`_, Marcin Junczys-Dowmunt et al.
-
-A framework for translation models, using the same models as BART
-
-The library provides a version of this model for conditional generation.
-
-
-T5
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=t5">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-t5-blueviolet">
-   </a>
-   <a href="model_doc/t5">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-t5-blueviolet">
-   </a>
-
-`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
-<https://arxiv.org/abs/1910.10683>`_, Colin Raffel et al.
-
-Uses the traditional transformer model (with a slight change in the positional embeddings, which are learned at each
-layer). To be able to operate on all NLP tasks, it transforms them into text-to-text problems by using specific
-prefixes: “summarize: ”, “question: ”, “translate English to German: ” and so forth.
-
-The pretraining includes both supervised and self-supervised training. Supervised training is conducted on downstream
-tasks provided by the GLUE and SuperGLUE benchmarks (converting them into text-to-text tasks as explained above).
-
-Self-supervised training uses corrupted tokens, by randomly removing 15% of the tokens and replacing them with
-individual sentinel tokens (if several consecutive tokens are marked for removal, the whole group is replaced with a
-single sentinel token). The input of the encoder is the corrupted sentence, the input of the decoder is the original
-sentence and the target is then the dropped out tokens delimited by their sentinel tokens.
-
-For instance, if we have the sentence “My dog is very cute .”, and we decide to remove the tokens: "dog", "is" and
-"cute", the encoder input becomes “My <x> very <y> .” and the target input becomes “<x> dog is <y> cute .<z>”
-
-The library provides a version of this model for conditional generation.
-
-
-MT5
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=mt5">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-mt5-blueviolet">
-   </a>
-   <a href="model_doc/mt5">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-mt5-blueviolet">
-   </a>
-
-`mT5: A massively multilingual pre-trained text-to-text transformer <https://arxiv.org/abs/2010.11934>`_, Linting Xue
-et al.
-
-The model architecture is same as T5. mT5's pretraining objective includes T5's self-supervised training, but not T5's
-supervised training. mT5 is trained on 101 languages.
-
-The library provides a version of this model for conditional generation.
-
-
-MBart
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=mbart">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-mbart-blueviolet">
-   </a>
-   <a href="model_doc/mbart">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-mbart-blueviolet">
-   </a>
-
-`Multilingual Denoising Pre-training for Neural Machine Translation <https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu,
-Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-
-The model architecture and pretraining objective is same as BART, but MBart is trained on 25 languages and is intended
-for supervised and unsupervised machine translation. MBart is one of the first methods for pretraining a complete
-sequence-to-sequence model by denoising full texts in multiple languages,
-
-The library provides a version of this model for conditional generation.
-
-The `mbart-large-en-ro checkpoint <https://huggingface.co/facebook/mbart-large-en-ro>`_ can be used for english ->
-romanian translation.
-
-The `mbart-large-cc25 <https://huggingface.co/facebook/mbart-large-cc25>`_ checkpoint can be finetuned for other
-translation and summarization tasks, using code in ```examples/pytorch/translation/``` , but is not very useful without
-finetuning.
-
-
-ProphetNet
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=prophetnet">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-prophetnet-blueviolet">
-   </a>
-   <a href="model_doc/prophetnet">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-prophetnet-blueviolet">
-   </a>
-
-`ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by
-Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.
-
-ProphetNet introduces a novel *sequence-to-sequence* pretraining objective, called *future n-gram prediction*. In
-future n-gram prediction, the model predicts the next n tokens simultaneously based on previous context tokens at each
-time step instead instead of just the single next token. The future n-gram prediction explicitly encourages the model
-to plan for the future tokens and prevent overfitting on strong local correlations. The model architecture is based on
-the original Transformer, but replaces the "standard" self-attention mechanism in the decoder by a a main
-self-attention mechanism and a self and n-stream (predict) self-attention mechanism.
-
-The library provides a pre-trained version of this model for conditional generation and a fine-tuned version for
-summarization.
-
-XLM-ProphetNet
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=xprophetnet">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-xprophetnet-blueviolet">
-   </a>
-   <a href="model_doc/xlmprophetnet">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xprophetnet-blueviolet">
-   </a>
-
-`ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by
-Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.
-
-XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained
-on the cross-lingual dataset `XGLUE <https://arxiv.org/abs/2004.01401>`__.
-
-The library provides a pre-trained version of this model for multi-lingual conditional generation and fine-tuned
-versions for headline generation and question generation, respectively.
-
-.. _multimodal-models:
-
-Multimodal models
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-There is one multimodal model in the library which has not been pretrained in the self-supervised fashion like the
-others.
-
-MMBT
------------------------------------------------------------------------------------------------------------------------
-
-`Supervised Multimodal Bitransformers for Classifying Images and Text <https://arxiv.org/abs/1909.02950>`_, Douwe Kiela
-et al.
-
-A transformers model used in multimodal settings, combining a text and an image to make predictions. The transformer
-model takes as inputs the embeddings of the tokenized text and the final activations of a pretrained on images resnet
-(after the pooling layer) that goes through a linear layer (to go from number of features at the end of the resnet to
-the hidden state dimension of the transformer).
-
-The different inputs are concatenated, and on top of the positional embeddings, a segment embedding is added to let the
-model know which part of the input vector corresponds to the text and which to the image.
-
-The pretrained model only works for classification.
-
-..
-    More information in this :doc:`model documentation <model_doc/mmbt>`. TODO: write this page
-
-.. _retrieval-based-models:
-
-Retrieval-based models
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Some models use documents retrieval during (pre)training and inference for open-domain question answering, for example.
-
-
-DPR
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=dpr">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-dpr-blueviolet">
-   </a>
-   <a href="model_doc/dpr">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-dpr-blueviolet">
-   </a>
-
-`Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`_, Vladimir Karpukhin et
-al.
-
-Dense Passage Retrieval (DPR) - is a set of tools and models for state-of-the-art open-domain question-answering
-research.
-
-
-DPR consists in three models:
-
-  * Question encoder: encode questions as vectors
-  * Context encoder: encode contexts as vectors
-  * Reader: extract the answer of the questions inside retrieved contexts, along with a relevance score (high if the
-    inferred span actually answers the question).
-
-DPR's pipeline (not implemented yet) uses a retrieval step to find the top k contexts given a certain question, and
-then it calls the reader with the question and the retrieved documents to get the answer.
-
-RAG
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-   <a href="https://huggingface.co/models?filter=rag">
-       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-rag-blueviolet">
-   </a>
-   <a href="model_doc/rag">
-       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-rag-blueviolet">
-   </a>
-
-`Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks <https://arxiv.org/abs/2005.11401>`_, Patrick Lewis,
-Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau
-Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela
-
-Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and Seq2Seq
-models. RAG models retrieve docs, pass them to a seq2seq model, then marginalize to generate outputs. The retriever and
-seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing both retrieval and generation
-to adapt to downstream tasks.
-
-The two models RAG-Token and RAG-Sequence are available for generation.
-
-More technical aspects
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Full vs sparse attention
------------------------------------------------------------------------------------------------------------------------
-
-Most transformer models use full attention in the sense that the attention matrix is square. It can be a big
-computational bottleneck when you have long texts. Longformer and reformer are models that try to be more efficient and
-use a sparse version of the attention matrix to speed up training.
-
-.. _lsh-attention:
-
-**LSH attention**
-
-:ref:`Reformer <reformer>` uses LSH attention. In the softmax(QK^t), only the biggest elements (in the softmax
-dimension) of the matrix QK^t are going to give useful contributions. So for each query q in Q, we can consider only
-the keys k in K that are close to q. A hash function is used to determine if q and k are close. The attention mask is
-modified to mask the current token (except at the first position), because it will give a query and a key equal (so
-very similar to each other). Since the hash can be a bit random, several hash functions are used in practice
-(determined by a n_rounds parameter) and then are averaged together.
-
-.. _local-attention:
-
-**Local attention**
-
-:ref:`Longformer <longformer>` uses local attention: often, the local context (e.g., what are the two tokens to the
-left and right?) is enough to take action for a given token. Also, by stacking attention layers that have a small
-window, the last layer will have a receptive field of more than just the tokens in the window, allowing them to build a
-representation of the whole sentence.
-
-Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access
-all tokens and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in
-their local window). This is shown in Figure 2d of the paper, see below for a sample attention mask:
-
-.. image:: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/local_attention_mask.png
-   :scale: 50 %
-   :align: center
-
-Using those attention matrices with less parameters then allows the model to have inputs having a bigger sequence
-length.
-
-Other tricks
------------------------------------------------------------------------------------------------------------------------
-
-.. _axial-pos-encoding:
-
-**Axial positional encodings**
-
-:ref:`Reformer <reformer>` uses axial positional encodings: in traditional transformer models, the positional encoding
-E is a matrix of size :math:`l` by :math:`d`, :math:`l` being the sequence length and :math:`d` the dimension of the
-hidden state. If you have very long texts, this matrix can be huge and take way too much space on the GPU. To alleviate
-that, axial positional encodings consist of factorizing that big matrix E in two smaller matrices E1 and E2, with
-dimensions :math:`l_{1} \times d_{1}` and :math:`l_{2} \times d_{2}`, such that :math:`l_{1} \times l_{2} = l` and
-:math:`d_{1} + d_{2} = d` (with the product for the lengths, this ends up being way smaller). The embedding for time
-step :math:`j` in E is obtained by concatenating the embeddings for timestep :math:`j \% l1` in E1 and :math:`j // l1`
-in E2.
diff --git a/docs/source/parallelism.md b/docs/source/parallelism.mdx
similarity index 100%
rename from docs/source/parallelism.md
rename to docs/source/parallelism.mdx
diff --git a/docs/source/performance.md b/docs/source/performance.mdx
similarity index 100%
rename from docs/source/performance.md
rename to docs/source/performance.mdx
diff --git a/docs/source/philosophy.mdx b/docs/source/philosophy.mdx
new file mode 100644
index 0000000000..13134c31d4
--- /dev/null
+++ b/docs/source/philosophy.mdx
@@ -0,0 +1,80 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Philosophy
+
+🤗 Transformers is an opinionated library built for:
+
+- NLP researchers and educators seeking to use/study/extend large-scale transformers models
+- hands-on practitioners who want to fine-tune those models and/or serve them in production
+- engineers who just want to download a pretrained model and use it to solve a given NLP task.
+
+The library was designed with two strong goals in mind:
+
+- Be as easy and fast to use as possible:
+
+  - We strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions,
+    just three standard classes required to use each model: [configuration](main_classes/configuration),
+    [models](main_classes/model) and [tokenizer](main_classes/tokenizer).
+  - All of these classes can be initialized in a simple and unified way from pretrained instances by using a common
+    `from_pretrained()` instantiation method which will take care of downloading (if needed), caching and
+    loading the related class instance and associated data (configurations' hyper-parameters, tokenizers' vocabulary,
+    and models' weights) from a pretrained checkpoint provided on [Hugging Face Hub](https://huggingface.co/models) or your own saved checkpoint.
+  - On top of those three base classes, the library provides two APIs: [`pipeline`] for quickly
+    using a model (plus its associated tokenizer and configuration) on a given task and
+    [`Trainer`]/`Keras.fit` to quickly train or fine-tune a given model.
+  - As a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to
+    extend/build-upon the library, just use regular Python/PyTorch/TensorFlow/Keras modules and inherit from the base
+    classes of the library to reuse functionalities like model loading/saving.
+
+- Provide state-of-the-art models with performances as close as possible to the original models:
+
+  - We provide at least one example for each architecture which reproduces a result provided by the official authors
+    of said architecture.
+  - The code is usually as close to the original code base as possible which means some PyTorch code may be not as
+    *pytorchic* as it could be as a result of being converted TensorFlow code and vice versa.
+
+A few other goals:
+
+- Expose the models' internals as consistently as possible:
+
+  - We give access, using a single API, to the full hidden-states and attention weights.
+  - Tokenizer and base model's API are standardized to easily switch between models.
+
+- Incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
+
+  - A simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning.
+  - Simple ways to mask and prune transformer heads.
+
+- Switch easily between PyTorch and TensorFlow 2.0, allowing training using one framework and inference using another.
+
+## Main concepts
+
+The library is built around three types of classes for each model:
+
+- **Model classes** such as [`BertModel`], which are 30+ PyTorch models ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)) or Keras models ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)) that work with the pretrained weights provided in the
+  library.
+- **Configuration classes** such as [`BertConfig`], which store all the parameters required to build
+  a model. You don't always need to instantiate these yourself. In particular, if you are using a pretrained model
+  without any modification, creating the model will automatically take care of instantiating the configuration (which
+  is part of the model).
+- **Tokenizer classes** such as [`BertTokenizer`], which store the vocabulary for each model and
+  provide methods for encoding/decoding strings in a list of token embeddings indices to be fed to a model.
+
+All these classes can be instantiated from pretrained instances and saved locally using two methods:
+
+- `from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either
+  provided by the library itself (the supported models can be found on the [Model Hub](https://huggingface.co/models)) or
+  stored locally (or on a server) by the user,
+- `save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using
+  `from_pretrained()`.
+
diff --git a/docs/source/philosophy.rst b/docs/source/philosophy.rst
deleted file mode 100644
index 644ef51c6b..0000000000
--- a/docs/source/philosophy.rst
+++ /dev/null
@@ -1,85 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Philosophy
-=======================================================================================================================
-
-🤗 Transformers is an opinionated library built for:
-
-- NLP researchers and educators seeking to use/study/extend large-scale transformers models
-- hands-on practitioners who want to fine-tune those models and/or serve them in production
-- engineers who just want to download a pretrained model and use it to solve a given NLP task.
-
-The library was designed with two strong goals in mind:
-
-- Be as easy and fast to use as possible:
-
-    - We strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions,
-      just three standard classes required to use each model: :doc:`configuration <main_classes/configuration>`,
-      :doc:`models <main_classes/model>` and :doc:`tokenizer <main_classes/tokenizer>`.
-    - All of these classes can be initialized in a simple and unified way from pretrained instances by using a common
-      :obj:`from_pretrained()` instantiation method which will take care of downloading (if needed), caching and
-      loading the related class instance and associated data (configurations' hyper-parameters, tokenizers' vocabulary,
-      and models' weights) from a pretrained checkpoint provided on `Hugging Face Hub
-      <https://huggingface.co/models>`__ or your own saved checkpoint.
-    - On top of those three base classes, the library provides two APIs: :func:`~transformers.pipeline` for quickly
-      using a model (plus its associated tokenizer and configuration) on a given task and
-      :func:`~transformers.Trainer`/:func:`~transformers.TFTrainer` to quickly train or fine-tune a given model.
-    - As a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to
-      extend/build-upon the library, just use regular Python/PyTorch/TensorFlow/Keras modules and inherit from the base
-      classes of the library to reuse functionalities like model loading/saving.
-
-- Provide state-of-the-art models with performances as close as possible to the original models:
-
-    - We provide at least one example for each architecture which reproduces a result provided by the official authors
-      of said architecture.
-    - The code is usually as close to the original code base as possible which means some PyTorch code may be not as
-      *pytorchic* as it could be as a result of being converted TensorFlow code and vice versa.
-
-A few other goals:
-
-- Expose the models' internals as consistently as possible:
-
-    - We give access, using a single API, to the full hidden-states and attention weights.
-    - Tokenizer and base model's API are standardized to easily switch between models.
-
-- Incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
-
-    - A simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning.
-    - Simple ways to mask and prune transformer heads.
-
-- Switch easily between PyTorch and TensorFlow 2.0, allowing training using one framework and inference using another.
-
-Main concepts
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The library is built around three types of classes for each model:
-
-- **Model classes** such as :class:`~transformers.BertModel`, which are 30+ PyTorch models (`torch.nn.Module
-  <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__) or Keras models (`tf.keras.Model
-  <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__) that work with the pretrained weights provided in the
-  library.
-- **Configuration classes** such as :class:`~transformers.BertConfig`, which store all the parameters required to build
-  a model. You don't always need to instantiate these yourself. In particular, if you are using a pretrained model
-  without any modification, creating the model will automatically take care of instantiating the configuration (which
-  is part of the model).
-- **Tokenizer classes** such as :class:`~transformers.BertTokenizer`, which store the vocabulary for each model and
-  provide methods for encoding/decoding strings in a list of token embeddings indices to be fed to a model.
-
-All these classes can be instantiated from pretrained instances and saved locally using two methods:
-
-- :obj:`from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either
-  provided by the library itself (the supported models are provided in the list :doc:`here <pretrained_models>`) or
-  stored locally (or on a server) by the user,
-- :obj:`save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using
-  :obj:`from_pretrained()`.
-
diff --git a/docs/source/pr_checks.md b/docs/source/pr_checks.mdx
similarity index 100%
rename from docs/source/pr_checks.md
rename to docs/source/pr_checks.mdx
diff --git a/docs/source/sagemaker.md b/docs/source/sagemaker.mdx
similarity index 100%
rename from docs/source/sagemaker.md
rename to docs/source/sagemaker.mdx
diff --git a/docs/source/serialization.mdx b/docs/source/serialization.mdx
new file mode 100644
index 0000000000..a78bda68c2
--- /dev/null
+++ b/docs/source/serialization.mdx
@@ -0,0 +1,430 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Exporting transformers models
+
+## ONNX / ONNXRuntime
+
+Projects [ONNX (Open Neural Network eXchange)](http://onnx.ai) and [ONNXRuntime (ORT)](https://microsoft.github.io/onnxruntime/) are part of an effort from leading industries in the AI field to provide a
+unified and community-driven format to store and, by extension, efficiently execute neural network leveraging a variety
+of hardware and dedicated optimizations.
+
+
+Starting from transformers v2.10.0 we partnered with ONNX Runtime to provide an easy export of transformers models to
+the ONNX format. You can have a look at the effort by looking at our joint blog post [Accelerate your NLP pipelines
+using Hugging Face Transformers and ONNX Runtime](https://medium.com/microsoftazure/accelerate-your-nlp-pipelines-using-hugging-face-transformers-and-onnx-runtime-2443578f4333).
+
+
+### Configuration-based approach
+
+Transformers v4.9.0 introduces a new package: `transformers.onnx`. This package allows converting checkpoints to an
+ONNX graph by leveraging configuration objects. These configuration objects come ready made for a number of model
+architectures, and are made to be easily extendable to other architectures.
+
+Ready-made configurations include the following models:
+
+<!--This table is automatically generated by make style, do not fill manually!-->
+
+- ALBERT
+- BART
+- BERT
+- CamemBERT
+- DistilBERT
+- GPT Neo
+- LayoutLM
+- Longformer
+- mBART
+- OpenAI GPT-2
+- RoBERTa
+- T5
+- XLM-RoBERTa
+
+This conversion is handled with the PyTorch version of models - it, therefore, requires PyTorch to be installed. If you
+would like to be able to convert from TensorFlow, please let us know by opening an issue.
+
+<Tip>
+
+The models showcased here are close to fully feature complete, but do lack some features that are currently in
+development. Namely, the ability to handle the past key values for decoder models is currently in the works.
+
+</Tip>
+
+#### Converting an ONNX model using the `transformers.onnx` package
+
+The package may be used as a Python module:
+
+```bash
+python -m transformers.onnx --help
+
+usage: Hugging Face ONNX Exporter tool [-h] -m MODEL -f {pytorch} [--features {default}] [--opset OPSET] [--atol ATOL] output
+
+positional arguments:
+  output                Path indicating where to store generated ONNX model.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -m MODEL, --model MODEL
+                        Model's name of path on disk to load.
+  --features {default}  Export the model with some additional features.
+  --opset OPSET         ONNX opset version to export the model with (default 12).
+  --atol ATOL           Absolute difference tolerance when validating the model.
+```
+
+Exporting a checkpoint using a ready-made configuration can be done as follows:
+
+```bash
+python -m transformers.onnx --model=bert-base-cased onnx/bert-base-cased/
+```
+
+This exports an ONNX graph of the mentioned checkpoint. Here it is *bert-base-cased*, but it can be any model from the
+hub, or a local path.
+
+It will be exported under `onnx/bert-base-cased`. You should see similar logs:
+
+```bash
+Validating ONNX model...
+        -[✓] ONNX model outputs' name match reference model ({'pooler_output', 'last_hidden_state'}
+- Validating ONNX Model output "last_hidden_state":
+                -[✓] (2, 8, 768) matchs (2, 8, 768)
+                -[✓] all values close (atol: 0.0001)
+- Validating ONNX Model output "pooler_output":
+                -[✓] (2, 768) matchs (2, 768)
+                -[✓] all values close (atol: 0.0001)
+All good, model saved at: onnx/bert-base-cased/model.onnx
+```
+
+This export can now be used in the ONNX inference runtime:
+
+```python
+import onnxruntime as ort
+
+from transformers import BertTokenizerFast
+tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
+
+ort_session = ort.InferenceSession("onnx/bert-base-cased/model.onnx")
+
+inputs = tokenizer("Using BERT in ONNX!", return_tensors="np")
+outputs = ort_session.run(["last_hidden_state", "pooler_output"], dict(inputs))
+```
+
+The outputs used (`["last_hidden_state", "pooler_output"]`) can be obtained by taking a look at the ONNX
+configuration of each model. For example, for BERT:
+
+```python
+from transformers.models.bert import BertOnnxConfig, BertConfig
+
+config = BertConfig()
+onnx_config = BertOnnxConfig(config)
+output_keys = list(onnx_config.outputs.keys())
+```
+
+#### Implementing a custom configuration for an unsupported architecture
+
+Let's take a look at the changes necessary to add a custom configuration for an unsupported architecture. Firstly, we
+will need a custom ONNX configuration object that details the model inputs and outputs. The BERT ONNX configuration is
+visible below:
+
+```python
+class BertOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("input_ids", {0: "batch", 1: "sequence"}),
+                ("attention_mask", {0: "batch", 1: "sequence"}),
+                ("token_type_ids", {0: "batch", 1: "sequence"}),
+            ]
+        )
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"}), ("pooler_output", {0: "batch"})])
+```
+
+Let's understand what's happening here. This configuration has two properties: the inputs, and the outputs.
+
+The inputs return a dictionary, where each key corresponds to an expected input, and each value indicates the axis of
+that input.
+
+For BERT, there are three necessary inputs. These three inputs are of similar shape, which is made up of two
+dimensions: the batch is the first dimension, and the second is the sequence.
+
+The outputs return a similar dictionary, where, once again, each key corresponds to an expected output, and each value
+indicates the axis of that output.
+
+Once this is done, a single step remains: adding this configuration object to the initialisation of the model class,
+and to the general `transformers` initialisation.
+
+An important fact to notice is the use of *OrderedDict* in both inputs and outputs properties. This is a requirements
+as inputs are matched against their relative position within the *PreTrainedModel.forward()* prototype and outputs are
+match against there position in the returned *BaseModelOutputX* instance.
+
+An example of such an addition is visible here, for the MBart model: [Making MBART ONNX-convertible](https://github.com/huggingface/transformers/pull/13049/commits/d097adcebd89a520f04352eb215a85916934204f)
+
+If you would like to contribute your addition to the library, we recommend you implement tests. An example of such
+tests is visible here: [Adding tests to the MBART ONNX conversion](https://github.com/huggingface/transformers/pull/13049/commits/5d642f65abf45ceeb72bd855ca7bfe2506a58e6a)
+
+### Graph conversion
+
+<Tip>
+
+The approach detailed here is bing deprecated. We recommend you follow the part above for an up to date approach.
+
+</Tip>
+
+Exporting a model is done through the script *convert_graph_to_onnx.py* at the root of the transformers sources. The
+following command shows how easy it is to export a BERT model from the library, simply run:
+
+```bash
+python convert_graph_to_onnx.py --framework <pt, tf> --model bert-base-cased bert-base-cased.onnx
+```
+
+The conversion tool works for both PyTorch and Tensorflow models and ensures:
+
+- The model and its weights are correctly initialized from the Hugging Face model hub or a local checkpoint.
+- The inputs and outputs are correctly generated to their ONNX counterpart.
+- The generated model can be correctly loaded through onnxruntime.
+
+<Tip>
+
+Currently, inputs and outputs are always exported with dynamic sequence axes preventing some optimizations on the
+ONNX Runtime. If you would like to see such support for fixed-length inputs/outputs, please open up an issue on
+transformers.
+
+</Tip>
+
+Also, the conversion tool supports different options which let you tune the behavior of the generated model:
+
+- **Change the target opset version of the generated model.** (More recent opset generally supports more operators and
+  enables faster inference)
+
+- **Export pipeline-specific prediction heads.** (Allow to export model along with its task-specific prediction
+  head(s))
+
+- **Use the external data format (PyTorch only).** (Lets you export model which size is above 2Gb ([More info](https://github.com/pytorch/pytorch/pull/33062)))
+
+
+### Optimizations
+
+ONNXRuntime includes some transformers-specific transformations to leverage optimized operations in the graph. Below
+are some of the operators which can be enabled to speed up inference through ONNXRuntime (*see note below*):
+
+- Constant folding
+- Attention Layer fusing
+- Skip connection LayerNormalization fusing
+- FastGeLU approximation
+
+Some of the optimizations performed by ONNX runtime can be hardware specific and thus lead to different performances if
+used on another machine with a different hardware configuration than the one used for exporting the model. For this
+reason, when using `convert_graph_to_onnx.py` optimizations are not enabled, ensuring the model can be easily
+exported to various hardware. Optimizations can then be enabled when loading the model through ONNX runtime for
+inference.
+
+
+<Tip>
+
+When quantization is enabled (see below), `convert_graph_to_onnx.py` script will enable optimizations on the
+model because quantization would modify the underlying graph making it impossible for ONNX runtime to do the
+optimizations afterwards.
+
+</Tip>
+
+<Tip>
+
+For more information about the optimizations enabled by ONNXRuntime, please have a look at the [ONNXRuntime Github](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers).
+
+</Tip>
+
+### Quantization
+
+ONNX exporter supports generating a quantized version of the model to allow efficient inference.
+
+Quantization works by converting the memory representation of the parameters in the neural network to a compact integer
+format. By default, weights of a neural network are stored as single-precision float (*float32*) which can express a
+wide-range of floating-point numbers with decent precision. These properties are especially interesting at training
+where you want fine-grained representation.
+
+On the other hand, after the training phase, it has been shown one can greatly reduce the range and the precision of
+*float32* numbers without changing the performances of the neural network.
+
+More technically, *float32* parameters are converted to a type requiring fewer bits to represent each number, thus
+reducing the overall size of the model. Here, we are enabling *float32* mapping to *int8* values (a non-floating,
+single byte, number representation) according to the following formula:
+
+$$y_{float32} = scale * x_{int8} - zero\_point$$
+
+<Tip>
+
+The quantization process will infer the parameter *scale* and *zero_point* from the neural network parameters
+
+</Tip>
+
+Leveraging tiny-integers has numerous advantages when it comes to inference:
+
+- Storing fewer bits instead of 32 bits for the *float32* reduces the size of the model and makes it load faster.
+- Integer operations execute a magnitude faster on modern hardware
+- Integer operations require less power to do the computations
+
+In order to convert a transformers model to ONNX IR with quantized weights you just need to specify `--quantize` when
+using `convert_graph_to_onnx.py`. Also, you can have a look at the `quantize()` utility-method in this same script
+file.
+
+Example of quantized BERT model export:
+
+```bash
+python convert_graph_to_onnx.py --framework <pt, tf> --model bert-base-cased --quantize bert-base-cased.onnx
+```
+
+<Tip>
+
+Quantization support requires ONNX Runtime >= 1.4.0
+
+</Tip>
+
+<Tip>
+
+When exporting quantized model you will end up with two different ONNX files. The one specified at the end of the
+above command will contain the original ONNX model storing *float32* weights. The second one, with `-quantized`
+suffix, will hold the quantized parameters.
+
+</Tip>
+
+## TorchScript
+
+<Tip>
+
+This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities with
+variable-input-size models. It is a focus of interest to us and we will deepen our analysis in upcoming releases,
+with more code examples, a more flexible implementation, and benchmarks comparing python-based codes with compiled
+TorchScript.
+
+</Tip>
+
+According to Pytorch's documentation: "TorchScript is a way to create serializable and optimizable models from PyTorch
+code". Pytorch's two modules [JIT and TRACE](https://pytorch.org/docs/stable/jit.html) allow the developer to export
+their model to be re-used in other programs, such as efficiency-oriented C++ programs.
+
+We have provided an interface that allows the export of 🤗 Transformers models to TorchScript so that they can be reused
+in a different environment than a Pytorch-based python program. Here we explain how to export and use our models using
+TorchScript.
+
+Exporting a model requires two things:
+
+- a forward pass with dummy inputs.
+- model instantiation with the `torchscript` flag.
+
+These necessities imply several things developers should be careful about. These are detailed below.
+
+
+### Implications
+
+### TorchScript flag and tied weights
+
+This flag is necessary because most of the language models in this repository have tied weights between their
+`Embedding` layer and their `Decoding` layer. TorchScript does not allow the export of models that have tied
+weights, therefore it is necessary to untie and clone the weights beforehand.
+
+This implies that models instantiated with the `torchscript` flag have their `Embedding` layer and `Decoding`
+layer separate, which means that they should not be trained down the line. Training would de-synchronize the two
+layers, leading to unexpected results.
+
+This is not the case for models that do not have a Language Model head, as those do not have tied weights. These models
+can be safely exported without the `torchscript` flag.
+
+### Dummy inputs and standard lengths
+
+The dummy inputs are used to do a model forward pass. While the inputs' values are propagating through the layers,
+Pytorch keeps track of the different operations executed on each tensor. These recorded operations are then used to
+create the "trace" of the model.
+
+The trace is created relatively to the inputs' dimensions. It is therefore constrained by the dimensions of the dummy
+input, and will not work for any other sequence length or batch size. When trying with a different size, an error such
+as:
+
+`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
+
+will be raised. It is therefore recommended to trace the model with a dummy input size at least as large as the largest
+input that will be fed to the model during inference. Padding can be performed to fill the missing values. As the model
+will have been traced with a large input size however, the dimensions of the different matrix will be large as well,
+resulting in more calculations.
+
+It is recommended to be careful of the total number of operations done on each input and to follow performance closely
+when exporting varying sequence-length models.
+
+### Using TorchScript in Python
+
+Below is an example, showing how to save, load models as well as how to use the trace for inference.
+
+#### Saving a model
+
+This snippet shows how to use TorchScript to export a `BertModel`. Here the `BertModel` is instantiated according
+to a `BertConfig` class and then saved to disk under the filename `traced_bert.pt`
+
+```python
+from transformers import BertModel, BertTokenizer, BertConfig
+import torch
+
+enc = BertTokenizer.from_pretrained("bert-base-uncased")
+
+# Tokenizing input text
+text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+tokenized_text = enc.tokenize(text)
+
+# Masking one of the input tokens
+masked_index = 8
+tokenized_text[masked_index] = '[MASK]'
+indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
+segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+
+# Creating a dummy input
+tokens_tensor = torch.tensor([indexed_tokens])
+segments_tensors = torch.tensor([segments_ids])
+dummy_input = [tokens_tensor, segments_tensors]
+
+# Initializing the model with the torchscript flag
+# Flag set to True even though it is not necessary as this model does not have an LM Head.
+config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, torchscript=True)
+
+# Instantiating the model
+model = BertModel(config)
+
+# The model needs to be in evaluation mode
+model.eval()
+
+# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
+model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
+
+# Creating the trace
+traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
+torch.jit.save(traced_model, "traced_bert.pt")
+```
+
+#### Loading a model
+
+This snippet shows how to load the `BertModel` that was previously saved to disk under the name `traced_bert.pt`.
+We are re-using the previously initialised `dummy_input`.
+
+```python
+loaded_model = torch.jit.load("traced_bert.pt")
+loaded_model.eval()
+
+all_encoder_layers, pooled_output = loaded_model(*dummy_input)
+```
+
+#### Using a traced model for inference
+
+Using the traced model for inference is as simple as using its `__call__` dunder method:
+
+```python
+traced_model(tokens_tensor, segments_tensors)
+```
diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst
deleted file mode 100644
index 508eb2ac7a..0000000000
--- a/docs/source/serialization.rst
+++ /dev/null
@@ -1,432 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Exporting transformers models
-***********************************************************************************************************************
-
-ONNX / ONNXRuntime
-=======================================================================================================================
-
-Projects `ONNX (Open Neural Network eXchange) <http://onnx.ai>`_ and `ONNXRuntime (ORT)
-<https://microsoft.github.io/onnxruntime/>`_ are part of an effort from leading industries in the AI field to provide a
-unified and community-driven format to store and, by extension, efficiently execute neural network leveraging a variety
-of hardware and dedicated optimizations.
-
-
-Starting from transformers v2.10.0 we partnered with ONNX Runtime to provide an easy export of transformers models to
-the ONNX format. You can have a look at the effort by looking at our joint blog post `Accelerate your NLP pipelines
-using Hugging Face Transformers and ONNX Runtime
-<https://medium.com/microsoftazure/accelerate-your-nlp-pipelines-using-hugging-face-transformers-and-onnx-runtime-2443578f4333>`_.
-
-
-Configuration-based approach
------------------------------------------------------------------------------------------------------------------------
-
-Transformers v4.9.0 introduces a new package: ``transformers.onnx``. This package allows converting checkpoints to an
-ONNX graph by leveraging configuration objects. These configuration objects come ready made for a number of model
-architectures, and are made to be easily extendable to other architectures.
-
-Ready-made configurations include the following models:
-
-..
-    This table is automatically generated by make style, do not fill manually!
-
-- ALBERT
-- BART
-- BERT
-- CamemBERT
-- DistilBERT
-- GPT Neo
-- LayoutLM
-- Longformer
-- mBART
-- OpenAI GPT-2
-- RoBERTa
-- T5
-- XLM-RoBERTa
-
-This conversion is handled with the PyTorch version of models - it, therefore, requires PyTorch to be installed. If you
-would like to be able to convert from TensorFlow, please let us know by opening an issue.
-
-.. note::
-    The models showcased here are close to fully feature complete, but do lack some features that are currently in
-    development. Namely, the ability to handle the past key values for decoder models is currently in the works.
-
-
-Converting an ONNX model using the ``transformers.onnx`` package
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The package may be used as a Python module:
-
-.. code-block::
-
-    python -m transformers.onnx --help
-
-    usage: Hugging Face ONNX Exporter tool [-h] -m MODEL -f {pytorch} [--features {default}] [--opset OPSET] [--atol ATOL] output
-
-    positional arguments:
-      output                Path indicating where to store generated ONNX model.
-
-    optional arguments:
-      -h, --help            show this help message and exit
-      -m MODEL, --model MODEL
-                            Model's name of path on disk to load.
-      --features {default}  Export the model with some additional features.
-      --opset OPSET         ONNX opset version to export the model with (default 12).
-      --atol ATOL           Absolute difference tolerance when validating the model.
-
-Exporting a checkpoint using a ready-made configuration can be done as follows:
-
-.. code-block::
-
-    python -m transformers.onnx --model=bert-base-cased onnx/bert-base-cased/
-
-This exports an ONNX graph of the mentioned checkpoint. Here it is `bert-base-cased`, but it can be any model from the
-hub, or a local path.
-
-It will be exported under ``onnx/bert-base-cased``. You should see similar logs:
-
-.. code-block::
-
-    Validating ONNX model...
-            -[✓] ONNX model outputs' name match reference model ({'pooler_output', 'last_hidden_state'}
-            - Validating ONNX Model output "last_hidden_state":
-                    -[✓] (2, 8, 768) matchs (2, 8, 768)
-                    -[✓] all values close (atol: 0.0001)
-            - Validating ONNX Model output "pooler_output":
-                    -[✓] (2, 768) matchs (2, 768)
-                    -[✓] all values close (atol: 0.0001)
-    All good, model saved at: onnx/bert-base-cased/model.onnx
-
-This export can now be used in the ONNX inference runtime:
-
-.. code-block::
-
-    import onnxruntime as ort
-
-    from transformers import BertTokenizerFast
-    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
-
-    ort_session = ort.InferenceSession("onnx/bert-base-cased/model.onnx")
-
-    inputs = tokenizer("Using BERT in ONNX!", return_tensors="np")
-    outputs = ort_session.run(["last_hidden_state", "pooler_output"], dict(inputs))
-
-The outputs used (:obj:`["last_hidden_state", "pooler_output"]`) can be obtained by taking a look at the ONNX
-configuration of each model. For example, for BERT:
-
-.. code-block::
-
-    from transformers.models.bert import BertOnnxConfig, BertConfig
-
-    config = BertConfig()
-    onnx_config = BertOnnxConfig(config)
-    output_keys = list(onnx_config.outputs.keys())
-
-Implementing a custom configuration for an unsupported architecture
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Let's take a look at the changes necessary to add a custom configuration for an unsupported architecture. Firstly, we
-will need a custom ONNX configuration object that details the model inputs and outputs. The BERT ONNX configuration is
-visible below:
-
-.. code-block::
-
-    class BertOnnxConfig(OnnxConfig):
-        @property
-        def inputs(self) -> Mapping[str, Mapping[int, str]]:
-            return OrderedDict(
-                [
-                    ("input_ids", {0: "batch", 1: "sequence"}),
-                    ("attention_mask", {0: "batch", 1: "sequence"}),
-                    ("token_type_ids", {0: "batch", 1: "sequence"}),
-                ]
-            )
-
-        @property
-        def outputs(self) -> Mapping[str, Mapping[int, str]]:
-            return OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"}), ("pooler_output", {0: "batch"})])
-
-Let's understand what's happening here. This configuration has two properties: the inputs, and the outputs.
-
-The inputs return a dictionary, where each key corresponds to an expected input, and each value indicates the axis of
-that input.
-
-For BERT, there are three necessary inputs. These three inputs are of similar shape, which is made up of two
-dimensions: the batch is the first dimension, and the second is the sequence.
-
-The outputs return a similar dictionary, where, once again, each key corresponds to an expected output, and each value
-indicates the axis of that output.
-
-Once this is done, a single step remains: adding this configuration object to the initialisation of the model class,
-and to the general ``transformers`` initialisation.
-
-An important fact to notice is the use of `OrderedDict` in both inputs and outputs properties. This is a requirements
-as inputs are matched against their relative position within the `PreTrainedModel.forward()` prototype and outputs are
-match against there position in the returned `BaseModelOutputX` instance.
-
-An example of such an addition is visible here, for the MBart model: `Making MBART ONNX-convertible
-<https://github.com/huggingface/transformers/pull/13049/commits/d097adcebd89a520f04352eb215a85916934204f>`__
-
-If you would like to contribute your addition to the library, we recommend you implement tests. An example of such
-tests is visible here: `Adding tests to the MBART ONNX conversion
-<https://github.com/huggingface/transformers/pull/13049/commits/5d642f65abf45ceeb72bd855ca7bfe2506a58e6a>`__
-
-Graph conversion
------------------------------------------------------------------------------------------------------------------------
-
-.. note::
-    The approach detailed here is bing deprecated. We recommend you follow the part above for an up to date approach.
-
-
-Exporting a model is done through the script `convert_graph_to_onnx.py` at the root of the transformers sources. The
-following command shows how easy it is to export a BERT model from the library, simply run:
-
-.. code-block:: bash
-
-    python convert_graph_to_onnx.py --framework <pt, tf> --model bert-base-cased bert-base-cased.onnx
-
-The conversion tool works for both PyTorch and Tensorflow models and ensures:
-
-* The model and its weights are correctly initialized from the Hugging Face model hub or a local checkpoint.
-* The inputs and outputs are correctly generated to their ONNX counterpart.
-* The generated model can be correctly loaded through onnxruntime.
-
-.. note::
-    Currently, inputs and outputs are always exported with dynamic sequence axes preventing some optimizations on the
-    ONNX Runtime. If you would like to see such support for fixed-length inputs/outputs, please open up an issue on
-    transformers.
-
-
-Also, the conversion tool supports different options which let you tune the behavior of the generated model:
-
-* **Change the target opset version of the generated model.** (More recent opset generally supports more operators and
-  enables faster inference)
-
-* **Export pipeline-specific prediction heads.** (Allow to export model along with its task-specific prediction
-  head(s))
-
-* **Use the external data format (PyTorch only).** (Lets you export model which size is above 2Gb (`More info
-  <https://github.com/pytorch/pytorch/pull/33062>`_))
-
-
-Optimizations
------------------------------------------------------------------------------------------------------------------------
-
-ONNXRuntime includes some transformers-specific transformations to leverage optimized operations in the graph. Below
-are some of the operators which can be enabled to speed up inference through ONNXRuntime (*see note below*):
-
-* Constant folding
-* Attention Layer fusing
-* Skip connection LayerNormalization fusing
-* FastGeLU approximation
-
-Some of the optimizations performed by ONNX runtime can be hardware specific and thus lead to different performances if
-used on another machine with a different hardware configuration than the one used for exporting the model. For this
-reason, when using ``convert_graph_to_onnx.py`` optimizations are not enabled, ensuring the model can be easily
-exported to various hardware. Optimizations can then be enabled when loading the model through ONNX runtime for
-inference.
-
-
-.. note::
-    When quantization is enabled (see below), ``convert_graph_to_onnx.py`` script will enable optimizations on the
-    model because quantization would modify the underlying graph making it impossible for ONNX runtime to do the
-    optimizations afterwards.
-
-.. note::
-    For more information about the optimizations enabled by ONNXRuntime, please have a look at the `ONNXRuntime Github
-    <https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers>`_.
-
-Quantization
------------------------------------------------------------------------------------------------------------------------
-
-ONNX exporter supports generating a quantized version of the model to allow efficient inference.
-
-Quantization works by converting the memory representation of the parameters in the neural network to a compact integer
-format. By default, weights of a neural network are stored as single-precision float (`float32`) which can express a
-wide-range of floating-point numbers with decent precision. These properties are especially interesting at training
-where you want fine-grained representation.
-
-On the other hand, after the training phase, it has been shown one can greatly reduce the range and the precision of
-`float32` numbers without changing the performances of the neural network.
-
-More technically, `float32` parameters are converted to a type requiring fewer bits to represent each number, thus
-reducing the overall size of the model. Here, we are enabling `float32` mapping to `int8` values (a non-floating,
-single byte, number representation) according to the following formula:
-
-.. math::
-    y_{float32} = scale * x_{int8} - zero\_point
-
-.. note::
-    The quantization process will infer the parameter `scale` and `zero_point` from the neural network parameters
-
-Leveraging tiny-integers has numerous advantages when it comes to inference:
-
-* Storing fewer bits instead of 32 bits for the `float32` reduces the size of the model and makes it load faster.
-* Integer operations execute a magnitude faster on modern hardware
-* Integer operations require less power to do the computations
-
-In order to convert a transformers model to ONNX IR with quantized weights you just need to specify ``--quantize`` when
-using ``convert_graph_to_onnx.py``. Also, you can have a look at the ``quantize()`` utility-method in this same script
-file.
-
-Example of quantized BERT model export:
-
-.. code-block:: bash
-
-    python convert_graph_to_onnx.py --framework <pt, tf> --model bert-base-cased --quantize bert-base-cased.onnx
-
-.. note::
-    Quantization support requires ONNX Runtime >= 1.4.0
-
-.. note::
-    When exporting quantized model you will end up with two different ONNX files. The one specified at the end of the
-    above command will contain the original ONNX model storing `float32` weights. The second one, with ``-quantized``
-    suffix, will hold the quantized parameters.
-
-
-TorchScript
-=======================================================================================================================
-
-.. note::
-    This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities with
-    variable-input-size models. It is a focus of interest to us and we will deepen our analysis in upcoming releases,
-    with more code examples, a more flexible implementation, and benchmarks comparing python-based codes with compiled
-    TorchScript.
-
-
-According to Pytorch's documentation: "TorchScript is a way to create serializable and optimizable models from PyTorch
-code". Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
-their model to be re-used in other programs, such as efficiency-oriented C++ programs.
-
-We have provided an interface that allows the export of 🤗 Transformers models to TorchScript so that they can be reused
-in a different environment than a Pytorch-based python program. Here we explain how to export and use our models using
-TorchScript.
-
-Exporting a model requires two things:
-
-* a forward pass with dummy inputs.
-* model instantiation with the ``torchscript`` flag.
-
-These necessities imply several things developers should be careful about. These are detailed below.
-
-
-Implications
------------------------------------------------------------------------------------------------------------------------
-
-TorchScript flag and tied weights
------------------------------------------------------------------------------------------------------------------------
-
-This flag is necessary because most of the language models in this repository have tied weights between their
-``Embedding`` layer and their ``Decoding`` layer. TorchScript does not allow the export of models that have tied
-weights, therefore it is necessary to untie and clone the weights beforehand.
-
-This implies that models instantiated with the ``torchscript`` flag have their ``Embedding`` layer and ``Decoding``
-layer separate, which means that they should not be trained down the line. Training would de-synchronize the two
-layers, leading to unexpected results.
-
-This is not the case for models that do not have a Language Model head, as those do not have tied weights. These models
-can be safely exported without the ``torchscript`` flag.
-
-Dummy inputs and standard lengths
------------------------------------------------------------------------------------------------------------------------
-
-The dummy inputs are used to do a model forward pass. While the inputs' values are propagating through the layers,
-Pytorch keeps track of the different operations executed on each tensor. These recorded operations are then used to
-create the "trace" of the model.
-
-The trace is created relatively to the inputs' dimensions. It is therefore constrained by the dimensions of the dummy
-input, and will not work for any other sequence length or batch size. When trying with a different size, an error such
-as:
-
-``The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2``
-
-will be raised. It is therefore recommended to trace the model with a dummy input size at least as large as the largest
-input that will be fed to the model during inference. Padding can be performed to fill the missing values. As the model
-will have been traced with a large input size however, the dimensions of the different matrix will be large as well,
-resulting in more calculations.
-
-It is recommended to be careful of the total number of operations done on each input and to follow performance closely
-when exporting varying sequence-length models.
-
-Using TorchScript in Python
------------------------------------------------------------------------------------------------------------------------
-
-Below is an example, showing how to save, load models as well as how to use the trace for inference.
-
-Saving a model
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This snippet shows how to use TorchScript to export a ``BertModel``. Here the ``BertModel`` is instantiated according
-to a ``BertConfig`` class and then saved to disk under the filename ``traced_bert.pt``
-
-.. code-block:: python
-
-    from transformers import BertModel, BertTokenizer, BertConfig
-    import torch
-
-    enc = BertTokenizer.from_pretrained("bert-base-uncased")
-
-    # Tokenizing input text
-    text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-    tokenized_text = enc.tokenize(text)
-
-    # Masking one of the input tokens
-    masked_index = 8
-    tokenized_text[masked_index] = '[MASK]'
-    indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
-    segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
-
-    # Creating a dummy input
-    tokens_tensor = torch.tensor([indexed_tokens])
-    segments_tensors = torch.tensor([segments_ids])
-    dummy_input = [tokens_tensor, segments_tensors]
-
-    # Initializing the model with the torchscript flag
-    # Flag set to True even though it is not necessary as this model does not have an LM Head.
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, torchscript=True)
-
-    # Instantiating the model
-    model = BertModel(config)
-
-    # The model needs to be in evaluation mode
-    model.eval()
-
-    # If you are instantiating the model with `from_pretrained` you can also easily set the TorchScript flag
-    model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
-
-    # Creating the trace
-    traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
-    torch.jit.save(traced_model, "traced_bert.pt")
-
-Loading a model
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This snippet shows how to load the ``BertModel`` that was previously saved to disk under the name ``traced_bert.pt``.
-We are re-using the previously initialised ``dummy_input``.
-
-.. code-block:: python
-
-    loaded_model = torch.jit.load("traced_bert.pt")
-    loaded_model.eval()
-
-    all_encoder_layers, pooled_output = loaded_model(*dummy_input)
-
-Using a traced model for inference
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Using the traced model for inference is as simple as using its ``__call__`` dunder method:
-
-.. code-block:: python
-
-    traced_model(tokens_tensor, segments_tensors)
diff --git a/docs/source/troubleshooting.md b/docs/source/troubleshooting.mdx
similarity index 100%
rename from docs/source/troubleshooting.md
rename to docs/source/troubleshooting.mdx
diff --git a/src/transformers/commands/add_new_model.py b/src/transformers/commands/add_new_model.py
index 1d2ce3f066..4cbdf7a0ef 100644
--- a/src/transformers/commands/add_new_model.py
+++ b/src/transformers/commands/add_new_model.py
@@ -172,8 +172,8 @@ class AddNewModelCommand(BaseTransformersCLICommand):
             os.remove(f"{directory}/test_modeling_flax_{lowercase_model_name}.py")
 
         shutil.move(
-            f"{directory}/{lowercase_model_name}.rst",
-            f"{path_to_transformer_root}/docs/source/model_doc/{lowercase_model_name}.rst",
+            f"{directory}/{lowercase_model_name}.mdx",
+            f"{path_to_transformer_root}/docs/source/model_doc/{lowercase_model_name}.mdx",
         )
 
         shutil.move(
diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py
index 15ef465131..16695572c2 100644
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -770,7 +770,7 @@ class MLflowCallback(TrainerCallback):
 
 class NeptuneCallback(TrainerCallback):
     """
-    A [`TrainerCallback`] that sends the logs to *Neptune <https://neptune.ai>*.
+    A [`TrainerCallback`] that sends the logs to [Neptune](https://neptune.ai).
     """
 
     def __init__(self):
diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py
index d9a8f47c2f..4dfa68af8f 100755
--- a/src/transformers/models/beit/modeling_beit.py
+++ b/src/transformers/models/beit/modeling_beit.py
@@ -955,8 +955,8 @@ class BeitPyramidPoolingModule(nn.ModuleList):
 
 class BeitUperHead(nn.Module):
     """
-    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of `UPerNet
-    <https://arxiv.org/abs/1807.10221>`_.
+    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
+    [UPerNet](https://arxiv.org/abs/1807.10221).
 
     Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
     """
@@ -1040,8 +1040,8 @@ class BeitUperHead(nn.Module):
 
 class BeitFCNHead(nn.Module):
     """
-    Fully Convolution Networks for Semantic Segmentation. This head is implemented of `FCNNet
-    <https://arxiv.org/abs/1411.4038>`_.
+    Fully Convolution Networks for Semantic Segmentation. This head is implemented of
+    [FCNNet](https://arxiv.org/abs/1411.4038>).
 
     Args:
         config (BeitConfig): Configuration.
diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py
index dfa5e74699..ee1e3bd7d7 100644
--- a/src/transformers/models/bertweet/tokenization_bertweet.py
+++ b/src/transformers/models/bertweet/tokenization_bertweet.py
@@ -441,7 +441,7 @@ class BertweetTokenizer(PreTrainedTokenizer):
 # Author: Christopher Potts <cgpotts@stanford.edu>
 #         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
 #         Pierpaolo Pantone <> (modifications)
-# URL: <http://nltk.org/>
+# URL: http://nltk.org/
 # For license information, see LICENSE.TXT
 #
 
diff --git a/src/transformers/models/cpm/tokenization_cpm.py b/src/transformers/models/cpm/tokenization_cpm.py
index 5cd3a72012..92262f23d5 100644
--- a/src/transformers/models/cpm/tokenization_cpm.py
+++ b/src/transformers/models/cpm/tokenization_cpm.py
@@ -33,7 +33,7 @@ class CpmTokenizer(XLNetTokenizer):
 
     def __init__(self, *args, **kwargs):
         """
-        Construct a CPM tokenizer. Based on *Jieba <https://pypi.org/project/jieba/>* and [SentencePiece](https://github.com/google/sentencepiece).
+        Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and [SentencePiece](https://github.com/google/sentencepiece).
 
         This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main
         methods. Users should refer to this superclass for more information regarding those methods.
diff --git a/src/transformers/models/cpm/tokenization_cpm_fast.py b/src/transformers/models/cpm/tokenization_cpm_fast.py
index 42a627d88c..f166d6375e 100644
--- a/src/transformers/models/cpm/tokenization_cpm_fast.py
+++ b/src/transformers/models/cpm/tokenization_cpm_fast.py
@@ -36,7 +36,7 @@ class CpmTokenizerFast(XLNetTokenizerFast):
 
     def __init__(self, *args, **kwargs):
         """
-        Construct a CPM tokenizer. Based on *Jieba <https://pypi.org/project/jieba/>* and [SentencePiece](https://github.com/google/sentencepiece).
+        Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and [SentencePiece](https://github.com/google/sentencepiece).
 
         This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main
         methods. Users should refer to this superclass for more information regarding those methods.
diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py
index 78eaa9b255..e4930cfb8e 100755
--- a/src/transformers/models/fnet/modeling_fnet.py
+++ b/src/transformers/models/fnet/modeling_fnet.py
@@ -518,8 +518,8 @@ FNET_INPUTS_DOCSTRING = r"""
 class FNetModel(FNetPreTrainedModel):
     """
 
-    The model can behave as an encoder, following the architecture described in `FNet: Mixing Tokens with Fourier
-    Transforms <https://arxiv.org/abs/2105.03824>`__ by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago
+    The model can behave as an encoder, following the architecture described in [FNet: Mixing Tokens with Fourier
+    Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago
     Ontanon.
 
     """
diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
index d75ad2a056..444149c696 100755
--- a/src/transformers/models/hubert/modeling_hubert.py
+++ b/src/transformers/models/hubert/modeling_hubert.py
@@ -64,8 +64,8 @@ def _compute_mask_indices(
     min_masks: int = 0,
 ) -> np.ndarray:
     """
-    Computes random mask spans for a given shape. Used to implement `SpecAugment: A Simple Data Augmentation Method for
-    ASR <https://arxiv.org/abs/1904.08779>`__. Note that this method is not optimized to run on TPU and should be run
+    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run
     on CPU as part of the preprocessing during training.
 
     Args:
@@ -923,8 +923,8 @@ class HubertModel(HubertPreTrainedModel):
         attention_mask: Optional[torch.LongTensor] = None,
     ):
         """
-        Masks extracted features along time axis and/or along feature axis according to `SpecAugment
-        <https://arxiv.org/abs/1904.08779>`__ .
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://arxiv.org/abs/1904.08779).
         """
 
         # `config.apply_spec_augment` can set masking to False
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index ab10009ad9..b3065af59b 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -212,8 +212,8 @@ def _compute_mask_indices(
         mask_length: size of the mask
         min_masks: minimum number of masked spans
 
-    Adapted from `fairseq's data_utils.py
-    <https://github.com/pytorch/fairseq/blob/e0788f7007a8473a76db573985031f3c94201e79/fairseq/data/data_utils.py#L376>`__.
+    Adapted from [fairseq's
+    data_utils.py](https://github.com/pytorch/fairseq/blob/e0788f7007a8473a76db573985031f3c94201e79/fairseq/data/data_utils.py#L376).
     """
     batch_size, sequence_length = shape
 
@@ -1146,8 +1146,8 @@ class TFHubertMainLayer(tf.keras.layers.Layer):
 
     def _mask_hidden_states(self, hidden_states: tf.Tensor, mask_time_indices: Optional[tf.Tensor] = None):
         """
-        Masks extracted features along time axis and/or along feature axis according to `SpecAugment
-        <https://arxiv.org/abs/1904.08779>`__ .
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://arxiv.org/abs/1904.08779).
         """
         batch_size, sequence_length, hidden_size = shape_list(hidden_states)
 
diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py
index a8eb01bd14..30b70a669d 100644
--- a/src/transformers/models/ibert/modeling_ibert.py
+++ b/src/transformers/models/ibert/modeling_ibert.py
@@ -734,8 +734,8 @@ class IBertModel(IBertPreTrainedModel):
     """
 
     The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
-    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
-    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
     Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
 
     """
diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py
index ac0f8b7359..a5e6df5d59 100644
--- a/src/transformers/models/layoutlm/modeling_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_layoutlm.py
@@ -966,7 +966,7 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
 @add_start_docstrings(
     """
     LayoutLM Model with a sequence classification head on top (a linear layer on top of the pooled output) e.g. for
-    document image classification tasks such as the `RVL-CDIP <https://www.cs.cmu.edu/~aharley/rvl-cdip/>`__ dataset.
+    document image classification tasks such as the [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
     """,
     LAYOUTLM_START_DOCSTRING,
 )
@@ -1096,8 +1096,8 @@ class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel):
 @add_start_docstrings(
     """
     LayoutLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
-    sequence labeling (information extraction) tasks such as the `FUNSD <https://guillaumejaume.github.io/FUNSD/>`__
-    dataset and the `SROIE <https://rrc.cvc.uab.es/?ch=13>`__ dataset.
+    sequence labeling (information extraction) tasks such as the [FUNSD](https://guillaumejaume.github.io/FUNSD/)
+    dataset and the [SROIE](https://rrc.cvc.uab.es/?ch=13) dataset.
     """,
     LAYOUTLM_START_DOCSTRING,
 )
diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
index 3df07a66f5..00198046aa 100755
--- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
@@ -943,8 +943,8 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
     """
     LayoutLMv2 Model with a sequence classification head on top (a linear layer on top of the concatenation of the
     final hidden state of the [CLS] token, average-pooled initial visual embeddings and average-pooled final visual
-    embeddings, e.g. for document image classification tasks such as the `RVL-CDIP
-    <https://www.cs.cmu.edu/~aharley/rvl-cdip/>`__ dataset.
+    embeddings, e.g. for document image classification tasks such as the
+    [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
     """,
     LAYOUTLMV2_START_DOCSTRING,
 )
@@ -1110,9 +1110,9 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
 @add_start_docstrings(
     """
     LayoutLMv2 Model with a token classification head on top (a linear layer on top of the text part of the hidden
-    states) e.g. for sequence labeling (information extraction) tasks such as `FUNSD
-    <https://guillaumejaume.github.io/FUNSD/>`__, `SROIE <https://rrc.cvc.uab.es/?ch=13>`__, `CORD
-    <https://github.com/clovaai/cord>`__ and `Kleister-NDA <https://github.com/applicaai/kleister-nda>`__.
+    states) e.g. for sequence labeling (information extraction) tasks such as
+    [FUNSD](https://guillaumejaume.github.io/FUNSD/), [SROIE](https://rrc.cvc.uab.es/?ch=13),
+    [CORD](https://github.com/clovaai/cord) and [Kleister-NDA](https://github.com/applicaai/kleister-nda).
     """,
     LAYOUTLMV2_START_DOCSTRING,
 )
@@ -1226,8 +1226,8 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
 
 @add_start_docstrings(
     """
-    LayoutLMv2 Model with a span classification head on top for extractive question-answering tasks such as `DocVQA
-    <https://rrc.cvc.uab.es/?ch=17>`__ (a linear layer on top of the text part of the hidden-states output to compute
+    LayoutLMv2 Model with a span classification head on top for extractive question-answering tasks such as
+    [DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to compute
     `span start logits` and `span end logits`).
     """,
     LAYOUTLMV2_START_DOCSTRING,
diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py
index 9af663401d..8960a3deb9 100644
--- a/src/transformers/models/prophetnet/modeling_prophetnet.py
+++ b/src/transformers/models/prophetnet/modeling_prophetnet.py
@@ -55,7 +55,7 @@ PROPHETNET_START_DOCSTRING = r"""
     methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
     pruning heads etc.)
 
-    Original ProphetNet code can be found at <https://github.com/microsoft/ProphetNet> . Checkpoints were converted
+    Original ProphetNet code can be found [here](https://github.com/microsoft/ProphetNet). Checkpoints were converted
     from original Fairseq checkpoints. For more information on the checkpoint conversion, please take a look at the
     file `convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py`.
 
diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index b10270d77d..1f3f0d41d4 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -214,8 +214,8 @@ class RetrievAugLMOutput(ModelOutput):
 
 class RagPreTrainedModel(PreTrainedModel):
     r"""
-    RAG models were released with the paper `Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks
-    <https://arxiv.org/abs/2005.11401>`_ by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.
+    RAG models were released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP
+    Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.
 
     RAG is a retriever augmented model and encapsulate three components: a question encoder, a dataset retriever and a
     generator, the encoder and generator are trainable while the retriever is just an indexed dataset.
diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py
index cf41f74ea5..0b5ce2e103 100644
--- a/src/transformers/models/rag/modeling_tf_rag.py
+++ b/src/transformers/models/rag/modeling_tf_rag.py
@@ -200,8 +200,8 @@ class TFRetrievAugLMOutput(ModelOutput):
 
 class TFRagPreTrainedModel(TFPreTrainedModel):
     r"""
-    RAG models were released with the paper `Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks
-    <https://arxiv.org/abs/2005.11401>`__ by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.
+    RAG models were released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP
+    Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.
 
     RAG is a retriever augmented model and encapsulate three components: a question encoder, a dataset retriever and a
     generator, the encoder and generator are trainable while the retriever is just an indexed dataset.
diff --git a/src/transformers/models/roformer/tokenization_roformer.py b/src/transformers/models/roformer/tokenization_roformer.py
index 3e9e8832fd..0a5f51a453 100644
--- a/src/transformers/models/roformer/tokenization_roformer.py
+++ b/src/transformers/models/roformer/tokenization_roformer.py
@@ -60,7 +60,7 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class RoFormerTokenizer(PreTrainedTokenizer):
     r"""
-    Construct a RoFormer tokenizer. Based on *Rust Jieba <https://pypi.org/project/rjieba/>*.
+    Construct a RoFormer tokenizer. Based on [Rust Jieba](https://pypi.org/project/rjieba/).
 
     This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py
index dada447525..3f754b8512 100644
--- a/src/transformers/models/sew/modeling_sew.py
+++ b/src/transformers/models/sew/modeling_sew.py
@@ -62,8 +62,8 @@ def _compute_mask_indices(
     min_masks: int = 0,
 ) -> np.ndarray:
     """
-    Computes random mask spans for a given shape. Used to implement `SpecAugment: A Simple Data Augmentation Method for
-    ASR <https://arxiv.org/abs/1904.08779>`__. Note that this method is not optimized to run on TPU and should be run
+    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run
     on CPU as part of the preprocessing during training.
 
     Args:
@@ -820,8 +820,8 @@ class SEWModel(SEWPreTrainedModel):
         attention_mask: Optional[torch.LongTensor] = None,
     ):
         """
-        Masks extracted features along time axis and/or along feature axis according to `SpecAugment
-        <https://arxiv.org/abs/1904.08779>`__ .
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://arxiv.org/abs/1904.08779).
         """
 
         # `config.apply_spec_augment` can set masking to False
diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py
index f6bb3d0b1c..5ef8c1572b 100644
--- a/src/transformers/models/sew_d/modeling_sew_d.py
+++ b/src/transformers/models/sew_d/modeling_sew_d.py
@@ -68,8 +68,8 @@ def _compute_mask_indices(
     min_masks: int = 0,
 ) -> np.ndarray:
     """
-    Computes random mask spans for a given shape. Used to implement `SpecAugment: A Simple Data Augmentation Method for
-    ASR <https://arxiv.org/abs/1904.08779>`__. Note that this method is not optimized to run on TPU and should be run
+    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run
     on CPU as part of the preprocessing during training.
 
     Args:
@@ -1352,8 +1352,8 @@ class SEWDModel(SEWDPreTrainedModel):
         attention_mask: Optional[torch.LongTensor] = None,
     ):
         """
-        Masks extracted features along time axis and/or along feature axis according to `SpecAugment
-        <https://arxiv.org/abs/1904.08779>`__ .
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://arxiv.org/abs/1904.08779).
         """
 
         # `config.apply_spec_augment` can set masking to False
diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py
index 5565e4c4b3..7abb71419d 100755
--- a/src/transformers/models/splinter/modeling_splinter.py
+++ b/src/transformers/models/splinter/modeling_splinter.py
@@ -605,8 +605,8 @@ SPLINTER_INPUTS_DOCSTRING = r"""
 )
 class SplinterModel(SplinterPreTrainedModel):
     """
-    The model is an encoder (with only self-attention) following the architecture described in `Attention is all you
-    need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion
+    The model is an encoder (with only self-attention) following the architecture described in [Attention is all you
+    need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion
     Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
     """
 
diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py
index fbfd0612da..0e112019ec 100755
--- a/src/transformers/models/unispeech/modeling_unispeech.py
+++ b/src/transformers/models/unispeech/modeling_unispeech.py
@@ -129,8 +129,8 @@ def _compute_mask_indices(
     min_masks: int = 0,
 ) -> np.ndarray:
     """
-    Computes random mask spans for a given shape. Used to implement `SpecAugment: A Simple Data Augmentation Method for
-    ASR <https://arxiv.org/abs/1904.08779>`__. Note that this method is not optimized to run on TPU and should be run
+    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run
     on CPU as part of the preprocessing during training.
 
     Args:
@@ -836,8 +836,8 @@ class UniSpeechEncoderStableLayerNorm(nn.Module):
 
 class UniSpeechGumbelVectorQuantizer(nn.Module):
     """
-    Vector quantization using gumbel softmax. See `CATEGORICAL REPARAMETERIZATION WITH GUMBEL-SOFTMAX
-    <https://arxiv.org/pdf/1611.01144.pdf>`__ for more information.
+    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
+    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
     """
 
     def __init__(self, config):
@@ -1068,8 +1068,8 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
         attention_mask: Optional[torch.LongTensor] = None,
     ):
         """
-        Masks extracted features along time axis and/or along feature axis according to `SpecAugment
-        <https://arxiv.org/abs/1904.08779>`__ .
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://arxiv.org/abs/1904.08779).
         """
 
         # `config.apply_spec_augment` can set masking to False
diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
index bc81a65a9b..9cdba19745 100755
--- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
@@ -163,8 +163,8 @@ def _compute_mask_indices(
     min_masks: int = 0,
 ) -> np.ndarray:
     """
-    Computes random mask spans for a given shape. Used to implement `SpecAugment: A Simple Data Augmentation Method for
-    ASR <https://arxiv.org/abs/1904.08779>`__. Note that this method is not optimized to run on TPU and should be run
+    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run
     on CPU as part of the preprocessing during training.
 
     Args:
@@ -870,8 +870,8 @@ class UniSpeechSatEncoderStableLayerNorm(nn.Module):
 
 class UniSpeechSatGumbelVectorQuantizer(nn.Module):
     """
-    Vector quantization using gumbel softmax. See `CATEGORICAL REPARAMETERIZATION WITH GUMBEL-SOFTMAX
-    <https://arxiv.org/pdf/1611.01144.pdf>`__ for more information.
+    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
+    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
     """
 
     def __init__(self, config):
@@ -1101,8 +1101,8 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
         attention_mask: Optional[torch.LongTensor] = None,
     ):
         """
-        Masks extracted features along time axis and/or along feature axis according to `SpecAugment
-        <https://arxiv.org/abs/1904.08779>`__ .
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://arxiv.org/abs/1904.08779).
         """
 
         # `config.apply_spec_augment` can set masking to False
diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py
index 22495b6d7b..7a5719e299 100755
--- a/src/transformers/models/visual_bert/modeling_visual_bert.py
+++ b/src/transformers/models/visual_bert/modeling_visual_bert.py
@@ -679,8 +679,8 @@ VISUAL_BERT_INPUTS_DOCSTRING = r"""
 class VisualBertModel(VisualBertPreTrainedModel):
     """
 
-    The model can behave as an encoder (with only self-attention) following the architecture described in `Attention is
-    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    The model can behave as an encoder (with only self-attention) following the architecture described in [Attention is
+    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
     Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
     """
 
diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
index 3c24e4b161..8344d43352 100644
--- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
@@ -114,8 +114,8 @@ def _compute_mask_indices(
     min_masks: int = 0,
 ) -> np.ndarray:
     """
-    Computes random mask spans for a given shape. Used to implement `SpecAugment: A Simple Data Augmentation Method for
-    ASR <https://arxiv.org/abs/1904.08779>`__. Note that this method is not optimized to run on TPU and should be run
+    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run
     on CPU as part of the preprocessing during training.
 
     Args:
@@ -685,8 +685,8 @@ class FlaxWav2Vec2StableLayerNormEncoder(nn.Module):
 
 class FlaxWav2Vec2GumbelVectorQuantizer(nn.Module):
     """
-    Vector quantization using gumbel softmax. See `CATEGORICAL REPARAMETERIZATION WITH GUMBEL-SOFTMAX
-    <https://arxiv.org/pdf/1611.01144.pdf>`__ for more information.
+    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
+    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
     """
 
     config: Wav2Vec2Config
diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
index f1de2f14a1..1770667414 100644
--- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
@@ -214,8 +214,8 @@ def _compute_mask_indices(
         mask_length: size of the mask
         min_masks: minimum number of masked spans
 
-    Adapted from `fairseq's data_utils.py
-    <https://github.com/pytorch/fairseq/blob/e0788f7007a8473a76db573985031f3c94201e79/fairseq/data/data_utils.py#L376>`__.
+    Adapted from [fairseq's
+    data_utils.py](https://github.com/pytorch/fairseq/blob/e0788f7007a8473a76db573985031f3c94201e79/fairseq/data/data_utils.py#L376).
     """
     batch_size, sequence_length = shape
 
@@ -1137,8 +1137,8 @@ class TFWav2Vec2MainLayer(tf.keras.layers.Layer):
 
     def _mask_hidden_states(self, hidden_states: tf.Tensor, mask_time_indices: Optional[tf.Tensor] = None):
         """
-        Masks extracted features along time axis and/or along feature axis according to `SpecAugment
-        <https://arxiv.org/abs/1904.08779>`__ .
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://arxiv.org/abs/1904.08779).
         """
         batch_size, sequence_length, hidden_size = shape_list(hidden_states)
 
diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
index 12da721992..9fcb1b52b2 100755
--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -177,8 +177,8 @@ def _compute_mask_indices(
     min_masks: int = 0,
 ) -> np.ndarray:
     """
-    Computes random mask spans for a given shape. Used to implement `SpecAugment: A Simple Data Augmentation Method for
-    ASR <https://arxiv.org/abs/1904.08779>`__. Note that this method is not optimized to run on TPU and should be run
+    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run
     on CPU as part of the preprocessing during training.
 
     Args:
@@ -907,8 +907,8 @@ class Wav2Vec2EncoderStableLayerNorm(nn.Module):
 
 class Wav2Vec2GumbelVectorQuantizer(nn.Module):
     """
-    Vector quantization using gumbel softmax. See `CATEGORICAL REPARAMETERIZATION WITH GUMBEL-SOFTMAX
-    <https://arxiv.org/pdf/1611.01144.pdf>`__ for more information.
+    Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
+    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
     """
 
     def __init__(self, config):
@@ -1212,8 +1212,8 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
         attention_mask: Optional[torch.LongTensor] = None,
     ):
         """
-        Masks extracted features along time axis and/or along feature axis according to `SpecAugment
-        <https://arxiv.org/abs/1904.08779>`__ .
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://arxiv.org/abs/1904.08779).
         """
 
         # `config.apply_spec_augment` can set masking to False
diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py
index 75bf62113f..c0e233431a 100755
--- a/src/transformers/models/wavlm/modeling_wavlm.py
+++ b/src/transformers/models/wavlm/modeling_wavlm.py
@@ -130,8 +130,8 @@ def _compute_mask_indices(
     min_masks: int = 0,
 ) -> np.ndarray:
     """
-    Computes random mask spans for a given shape. Used to implement `SpecAugment: A Simple Data Augmentation Method for
-    ASR <https://arxiv.org/abs/1904.08779>`__. Note that this method is not optimized to run on TPU and should be run
+    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run
     on CPU as part of the preprocessing during training.
 
     Args:
@@ -866,8 +866,8 @@ class WavLMEncoderStableLayerNorm(nn.Module):
 
 class WavLMGumbelVectorQuantizer(nn.Module):
     """
-    Vector quantization using gumbel softmax. See `CATEGORICAL REPARAMETERIZATION WITH GUMBEL-SOFTMAX
-    <https://arxiv.org/pdf/1611.01144.pdf>`__ for more information.
+    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
+    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
     """
 
     def __init__(self, config):
@@ -1167,8 +1167,8 @@ class WavLMModel(WavLMPreTrainedModel):
         attention_mask: Optional[torch.LongTensor] = None,
     ):
         """
-        Masks extracted features along time axis and/or along feature axis according to `SpecAugment
-        <https://arxiv.org/abs/1904.08779>`__ .
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://arxiv.org/abs/1904.08779).
         """
 
         # `config.apply_spec_augment` can set masking to False
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.mdx b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.mdx
new file mode 100644
index 0000000000..954db89a62
--- /dev/null
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.mdx
@@ -0,0 +1,238 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# {{cookiecutter.modelname}}
+
+## Overview
+
+The {{cookiecutter.modelname}} model was proposed in [<INSERT PAPER NAME HERE>(<INSERT PAPER LINK HERE>)  by <INSERT AUTHORS HERE>. <INSERT SHORT SUMMARY HERE>
+
+The abstract from the paper is the following:
+
+*<INSERT PAPER ABSTRACT HERE>*
+
+Tips:
+
+<INSERT TIPS ABOUT MODEL HERE>
+
+This model was contributed by [INSERT YOUR HF USERNAME HERE](<https://huggingface.co/<INSERT YOUR HF USERNAME HERE>)`. The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
+
+## XLNetConfig
+
+[[autodoc]] XLNetConfig
+
+## {{cookiecutter.camelcase_modelname}}Config
+
+[[autodoc]] {{cookiecutter.camelcase_modelname}}Config
+
+
+## {{cookiecutter.camelcase_modelname}}Tokenizer
+
+[[autodoc]] {{cookiecutter.camelcase_modelname}}Tokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+
+## {{cookiecutter.camelcase_modelname}}TokenizerFast
+
+[[autodoc]] {{cookiecutter.camelcase_modelname}}TokenizerFast
+
+
+{% if "PyTorch" in cookiecutter.generate_tensorflow_pytorch_and_flax -%}
+## {{cookiecutter.camelcase_modelname}}Model
+
+[[autodoc]] {{cookiecutter.camelcase_modelname}}Model
+    - forward
+
+{% if cookiecutter.is_encoder_decoder_model == "False" %}
+## {{cookiecutter.camelcase_modelname}}ForCausalLM
+
+[[autodoc]] {{cookiecutter.camelcase_modelname}}ForCausalLM
+    - forward
+
+
+## {{cookiecutter.camelcase_modelname}}ForMaskedLM
+
+[[autodoc]] {{cookiecutter.camelcase_modelname}}ForMaskedLM
+    - forward
+
+
+## {{cookiecutter.camelcase_modelname}}ForSequenceClassification
+
+[[autodoc]] transformers.{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+    - forward
+
+## {{cookiecutter.camelcase_modelname}}ForMultipleChoice
+
+[[autodoc]] transformers.{{cookiecutter.camelcase_modelname}}ForMultipleChoice
+    - forward
+
+
+## {{cookiecutter.camelcase_modelname}}ForTokenClassification
+
+[[autodoc]] transformers.{{cookiecutter.camelcase_modelname}}ForTokenClassification
+    - forward
+
+
+## {{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+
+[[autodoc]] {{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+    - forward
+
+{%- else %}
+## {{cookiecutter.camelcase_modelname}}ForConditionalGeneration
+
+[[autodoc]] {{cookiecutter.camelcase_modelname}}ForConditionalGeneration
+    - forward
+
+
+## {{cookiecutter.camelcase_modelname}}ForSequenceClassification
+
+[[autodoc]] {{cookiecutter.camelcase_modelname}}ForSequenceClassification
+    - forward
+
+
+## {{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+
+[[autodoc]] {{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+    - forward
+
+
+## {{cookiecutter.camelcase_modelname}}ForCausalLM
+
+[[autodoc]] {{cookiecutter.camelcase_modelname}}ForCausalLM
+    - forward
+
+
+{% endif -%}
+{% endif -%}
+{% if "TensorFlow" in cookiecutter.generate_tensorflow_pytorch_and_flax -%}
+
+## TF{{cookiecutter.camelcase_modelname}}Model
+
+[[autodoc]] TF{{cookiecutter.camelcase_modelname}}Model
+    - call
+
+{% if cookiecutter.is_encoder_decoder_model == "False" %}
+## TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
+
+[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
+    - call
+
+
+## TF{{cookiecutter.camelcase_modelname}}ForCausalLM[[autodoc]] 
+
+[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForCausalLM
+    - call
+
+
+## TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+
+[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+    - call
+
+
+## TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice
+
+[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice
+    - call
+
+
+## TF{{cookiecutter.camelcase_modelname}}ForTokenClassification
+
+[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForTokenClassification
+    - call
+
+
+## TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+
+[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+    - call
+
+
+{%- else %}
+## TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
+
+[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
+    - call
+
+
+{% endif -%}
+{% endif -%}
+
+{% if "Flax" in cookiecutter.generate_tensorflow_pytorch_and_flax -%}
+
+## Flax{{cookiecutter.camelcase_modelname}}Model
+
+[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}Model
+    - call
+
+{% if cookiecutter.is_encoder_decoder_model == "False" %}
+## Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM
+
+[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM
+    - call
+
+
+## Flax{{cookiecutter.camelcase_modelname}}ForCausalLM
+
+[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForCausalLM
+    - call
+
+
+## Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+
+[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+    - call
+
+
+## Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice
+
+[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice
+    - call
+
+
+## Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification
+
+[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification
+    - call
+
+
+## Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+
+[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+    - call
+
+
+{%- else %}
+## Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+
+[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+    - call
+
+
+## Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+
+[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+    - call
+
+
+## Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
+
+[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
+    - call
+
+
+{% endif -%}
+{% endif -%}
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst
deleted file mode 100644
index ed9dad6412..0000000000
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst
+++ /dev/null
@@ -1,272 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-{{cookiecutter.modelname}}
------------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The {{cookiecutter.modelname}} model was proposed in `<INSERT PAPER NAME HERE>
-<<INSERT PAPER LINK HERE>>`__  by <INSERT AUTHORS HERE>. <INSERT SHORT SUMMARY HERE>
-
-The abstract from the paper is the following:
-
-*<INSERT PAPER ABSTRACT HERE>*
-
-Tips:
-
-<INSERT TIPS ABOUT MODEL HERE>
-
-This model was contributed by `<INSERT YOUR HF USERNAME HERE> 
-<https://huggingface.co/<INSERT YOUR HF USERNAME HERE>>`__. The original code can be found `here 
-<<INSERT LINK TO GITHUB REPO HERE>>`__.
-
-{{cookiecutter.camelcase_modelname}}Config
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}Config
-    :members:
-
-
-{{cookiecutter.camelcase_modelname}}Tokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}Tokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-{{cookiecutter.camelcase_modelname}}TokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast
-    :members:
-
-
-{% if "PyTorch" in cookiecutter.generate_tensorflow_pytorch_and_flax -%}
-{{cookiecutter.camelcase_modelname}}Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}Model
-    :members: forward
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-{{cookiecutter.camelcase_modelname}}ForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForCausalLM
-    :members: forward
-
-
-{{cookiecutter.camelcase_modelname}}ForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForMaskedLM
-    :members: forward
-
-
-{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-    :members: forward
-
-
-{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-    :members: forward
-
-
-{{cookiecutter.camelcase_modelname}}ForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForTokenClassification
-    :members: forward
-
-
-{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-    :members: forward
-
-{%- else %}
-{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-    :members: forward
-
-
-{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-    :members: forward
-
-
-{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-    :members: forward
-
-
-{{cookiecutter.camelcase_modelname}}ForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForCausalLM
-    :members: forward
-
-
-{% endif -%}
-{% endif -%}
-{% if "TensorFlow" in cookiecutter.generate_tensorflow_pytorch_and_flax -%}
-
-TF{{cookiecutter.camelcase_modelname}}Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}Model
-    :members: call
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
-    :members: call
-
-
-TF{{cookiecutter.camelcase_modelname}}ForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForCausalLM
-    :members: call
-
-
-TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-    :members: call
-
-
-TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-    :members: call
-
-
-TF{{cookiecutter.camelcase_modelname}}ForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForTokenClassification
-    :members: call
-
-
-TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-    :members: call
-
-
-{%- else %}
-TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-    :members: call
-
-
-{% endif -%}
-{% endif -%}
-
-{% if "Flax" in cookiecutter.generate_tensorflow_pytorch_and_flax -%}
-
-Flax{{cookiecutter.camelcase_modelname}}Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Flax{{cookiecutter.camelcase_modelname}}Model
-    :members: call
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM
-    :members: call
-
-
-Flax{{cookiecutter.camelcase_modelname}}ForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Flax{{cookiecutter.camelcase_modelname}}ForCausalLM
-    :members: call
-
-
-Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-    :members: call
-
-
-Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-    :members: call
-
-
-Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification
-    :members: call
-
-
-Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-    :members: call
-
-
-{%- else %}
-Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-    :members: call
-
-
-Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-    :members: call
-
-
-Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-    :members: call
-
-
-{% endif -%}
-{% endif -%}
diff --git a/utils/check_table.py b/utils/check_table.py
index 2d5508d546..b0f0ed5f9e 100644
--- a/utils/check_table.py
+++ b/utils/check_table.py
@@ -208,15 +208,15 @@ def get_onnx_model_list():
 def check_onnx_model_list(overwrite=False):
     """Check the model list in the serialization.rst is consistent with the state of the lib and maybe `overwrite`."""
     current_list, start_index, end_index, lines = _find_text_in_file(
-        filename=os.path.join(PATH_TO_DOCS, "serialization.rst"),
-        start_prompt="    This table is automatically generated by make style, do not fill manually!",
+        filename=os.path.join(PATH_TO_DOCS, "serialization.mdx"),
+        start_prompt="<!--This table is automatically generated by make style, do not fill manually!-->",
         end_prompt="This conversion is handled with the PyTorch version of models ",
     )
     new_list = get_onnx_model_list()
 
     if current_list != new_list:
         if overwrite:
-            with open(os.path.join(PATH_TO_DOCS, "serialization.rst"), "w", encoding="utf-8", newline="\n") as f:
+            with open(os.path.join(PATH_TO_DOCS, "serialization.mdx"), "w", encoding="utf-8", newline="\n") as f:
                 f.writelines(lines[:start_index] + [new_list] + lines[end_index:])
         else:
             raise ValueError("The list of ONNX-supported models needs an update. Run `make fix-copies` to fix this.")