From 37be3786cf1de9d21233f543c231866e68954998 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Mon, 8 Jun 2020 11:28:19 -0400
Subject: [PATCH] Clean documentation (#4849)

* Clean documentation
---
 docs/source/model_doc/albert.rst        | 13 ++++++++
 docs/source/model_doc/bart.rst          | 40 ++++++++++++++---------
 docs/source/model_doc/camembert.rst     | 17 ++++++++++
 docs/source/model_doc/ctrl.rst          |  3 ++
 docs/source/model_doc/distilbert.rst    | 26 +++++++++++++++
 docs/source/model_doc/electra.rst       | 17 ++++++++++
 docs/source/model_doc/flaubert.rst      | 43 +++++++++++++++++++++++++
 docs/source/model_doc/longformer.rst    | 21 +++++++++---
 docs/source/model_doc/marian.rst        | 24 ++++++++------
 docs/source/model_doc/roberta.rst       | 25 ++++++++++++++
 docs/source/model_doc/t5.rst            | 30 ++++++++---------
 docs/source/model_doc/xlm.rst           | 15 +++++++++
 docs/source/model_doc/xlmroberta.rst    | 17 ++++++++++
 docs/source/model_doc/xlnet.rst         | 28 ++++++++++++----
 src/transformers/modeling_electra.py    |  4 +--
 src/transformers/modeling_longformer.py | 12 +++----
 src/transformers/modeling_roberta.py    |  2 +-
 src/transformers/modeling_xlnet.py      |  2 +-
 18 files changed, 277 insertions(+), 62 deletions(-)

diff --git a/docs/source/model_doc/albert.rst b/docs/source/model_doc/albert.rst
index 057187e3d0..b5c4fbd113 100644
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -67,6 +67,12 @@ AlbertForSequenceClassification
 .. autoclass:: transformers.AlbertForSequenceClassification
     :members:
 
+AlbertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertForTokenClassification
+    :members:
+
 
 AlbertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -103,6 +109,13 @@ TFAlbertForMultipleChoice
     :members:
 
 
+TFAlbertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertForTokenClassification
+    :members:
+
+
 TFAlbertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/model_doc/bart.rst b/docs/source/model_doc/bart.rst
index 7d28287b2c..0c4ccf73fb 100644
--- a/docs/source/model_doc/bart.rst
+++ b/docs/source/model_doc/bart.rst
@@ -4,8 +4,9 @@ Bart
 file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
 @sshleifer
 
-Paper
-~~~~~
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The Bart model was `proposed <https://arxiv.org/abs/1910.13461>`_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019.
 According to the abstract,
 
@@ -16,14 +17,26 @@ According to the abstract,
 The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_
 
 
-Implementation Notes
-~~~~~~~~~~~~~~~~~~~~
+Implementation Notes:
+
 - Bart doesn't use :obj:`token_type_ids` for sequence classification. Use BartTokenizer.encode to get the proper splitting.
 - The forward pass of ``BartModel`` will create decoder inputs (using the helper function ``transformers.modeling_bart._prepare_bart_decoder_inputs``)  if they are not passed. This is different than some other modeling APIs.
 - Model predictions are intended to be identical to the original implementation. This only works, however, if the string you pass to ``fairseq.encode`` starts with a space.
 - ``BartForConditionalGeneration.generate`` should be used for conditional generation tasks like summarization, see the example in that docstrings
 - Models that load the ``"facebook/bart-large-cnn"`` weights will not have a ``mask_token_id``, or be able to perform mask filling tasks.
 
+BartConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartConfig
+    :members:
+
+
+BartTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartTokenizer
+    :members:
 
 
 BartModel
@@ -35,6 +48,13 @@ BartModel
 .. autofunction:: transformers.modeling_bart._prepare_bart_decoder_inputs
 
 
+BartForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartForSequenceClassification
+    :members: forward
+
+
 BartForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -42,15 +62,3 @@ BartForConditionalGeneration
     :members: generate, forward
 
 
-BartForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BartForSequenceClassification
-    :members: forward
-
-BartConfig
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BartConfig
-    :members:
-
diff --git a/docs/source/model_doc/camembert.rst b/docs/source/model_doc/camembert.rst
index 44e4e8617f..b2f28842db 100644
--- a/docs/source/model_doc/camembert.rst
+++ b/docs/source/model_doc/camembert.rst
@@ -1,6 +1,9 @@
 CamemBERT
 ----------------------------------------------------
 
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`__
 by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
 Clergerie, Djamé Seddah, and Benoît Sagot. It is based on Facebook's RoBERTa model released in 2019. It is a model
@@ -74,6 +77,13 @@ CamembertForTokenClassification
     :members:
 
 
+CamembertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertForQuestionAnswering
+    :members:
+
+
 TFCamembertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -100,3 +110,10 @@ TFCamembertForTokenClassification
 
 .. autoclass:: transformers.TFCamembertForTokenClassification
     :members:
+
+
+TFCamembertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFCamembertForQuestionAnswering
+    :members:
\ No newline at end of file
diff --git a/docs/source/model_doc/ctrl.rst b/docs/source/model_doc/ctrl.rst
index 459af52bd7..2683320eb3 100644
--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -1,6 +1,9 @@
 CTRL
 ----------------------------------------------------
 
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_
 by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
diff --git a/docs/source/model_doc/distilbert.rst b/docs/source/model_doc/distilbert.rst
index 9eb9fa151d..f5b3727f5d 100644
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -1,6 +1,9 @@
 DistilBERT
 ----------------------------------------------------
 
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The DistilBERT model was proposed in the blog post
 `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`__,
 and the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__.
@@ -72,6 +75,13 @@ DistilBertForSequenceClassification
     :members:
 
 
+DistilBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DistilBertForTokenClassification
+    :members:
+
+
 DistilBertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -99,6 +109,22 @@ TFDistilBertForSequenceClassification
     :members:
 
 
+
+TFDistilBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDistilBertForMultipleChoice
+    :members:
+
+
+
+TFDistilBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDistilBertForTokenClassification
+    :members:
+
+
 TFDistilBertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/model_doc/electra.rst b/docs/source/model_doc/electra.rst
index 3dbac2cee2..98a21ab25e 100644
--- a/docs/source/model_doc/electra.rst
+++ b/docs/source/model_doc/electra.rst
@@ -1,6 +1,9 @@
 ELECTRA
 ----------------------------------------------------
 
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The ELECTRA model was proposed in the paper.
 `ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators <https://openreview.net/pdf?id=r1xMH1BtvB>`__.
 ELECTRA is a new pre-training approach which trains two transformer models: the generator and the discriminator. The
@@ -89,6 +92,13 @@ ElectraForMaskedLM
     :members:
 
 
+ElectraForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForSequenceClassification
+    :members:
+
+
 ElectraForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -122,3 +132,10 @@ TFElectraForTokenClassification
 
 .. autoclass:: transformers.TFElectraForTokenClassification
     :members:
+
+
+TFElectraForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForQuestionAnswering
+    :members:
\ No newline at end of file
diff --git a/docs/source/model_doc/flaubert.rst b/docs/source/model_doc/flaubert.rst
index c4c2aa4a29..c542fc0507 100644
--- a/docs/source/model_doc/flaubert.rst
+++ b/docs/source/model_doc/flaubert.rst
@@ -1,6 +1,9 @@
 FlauBERT
 ----------------------------------------------------
 
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The FlauBERT model was proposed in the paper
 `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le et al.
 It's a transformer pre-trained using a masked language modeling (MLM) objective (BERT-like).
@@ -72,3 +75,43 @@ FlaubertForQuestionAnswering
     :members:
 
 
+TFFlaubertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertModel
+    :members:
+
+
+TFFlaubertWithLMHeadModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertWithLMHeadModel
+    :members:
+
+
+TFFlaubertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertForSequenceClassification
+    :members:
+
+
+TFFlaubertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertForMultipleChoice
+    :members:
+
+
+TFFlaubertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertForTokenClassification
+    :members:
+
+
+TFFlaubertForQuestionAnsweringSimple
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertForQuestionAnsweringSimple
+    :members:
\ No newline at end of file
diff --git a/docs/source/model_doc/longformer.rst b/docs/source/model_doc/longformer.rst
index b25bab001c..badfb4c091 100644
--- a/docs/source/model_doc/longformer.rst
+++ b/docs/source/model_doc/longformer.rst
@@ -4,7 +4,7 @@ Longformer
 file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_
 
 Overview
-~~~~~
+~~~~~~~~~
 The Longformer model was presented in `Longformer: The Long-Document Transformer <https://arxiv.org/pdf/2004.05150.pdf>`_ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 Here the abstract: 
 
@@ -13,7 +13,7 @@ Here the abstract:
 The Authors' code can be found `here <https://github.com/allenai/longformer>`_ .
 
 Longformer Self Attention
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~
 Longformer self attention employs self attention on both a "local" context and a "global" context.
 Most tokens only attend "locally" to each other meaning that each token attends to its :math:`\frac{1}{2} w` previous tokens and :math:`\frac{1}{2} w` succeding tokens with :math:`w` being the window length as defined in `config.attention_window`. Note that `config.attention_window` can be of type ``list`` to define a different :math:`w` for each layer. 
 A selecetd few tokens attend "globally" to all other tokens, as it is conventionally done for all tokens in *e.g.* `BertSelfAttention`.
@@ -55,6 +55,13 @@ LongformerTokenizer
     :members: 
 
 
+LongformerTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerTokenizerFast
+    :members: 
+
+
 LongformerModel
 ~~~~~~~~~~~~~~~~~~~~
 
@@ -69,10 +76,10 @@ LongformerForMaskedLM
     :members:
 
 
-LongformerForQuestionAnswering
+LongformerForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: transformers.LongformerForQuestionAnswering
+.. autoclass:: transformers.LongformerForSequenceClassification
     :members:
 
 
@@ -89,3 +96,9 @@ LongformerForTokenClassification
 .. autoclass:: transformers.LongformerForTokenClassification
     :members:
 
+
+LongformerForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForQuestionAnswering
+    :members:
diff --git a/docs/source/model_doc/marian.rst b/docs/source/model_doc/marian.rst
index ef72e93e13..c4e64d61ee 100644
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -6,11 +6,11 @@ file a `Github Issue <https://github.com/huggingface/transformers/issues/new?ass
 
 Implementation Notes
 ~~~~~~~~~~~~~~~~~~~~
-- each model is about 298 MB on disk, there are 1,000+ models.
+- Each model is about 298 MB on disk, there are 1,000+ models.
 - The list of supported language pairs can be found `here <https://huggingface.co/Helsinki-NLP>`__.
 - The 1,000+ models were originally trained by `Jörg Tiedemann <https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann>`__ using the `Marian <https://marian-nmt.github.io/>`_ C++ library, which supports fast training and translation.
 - All models are transformer encoder-decoders with 6 layers in each component. Each model's performance is documented in a model card.
-- the 80 opus models that require BPE preprocessing are not supported.
+- The 80 opus models that require BPE preprocessing are not supported.
 - The modeling code is the same as ``BartForConditionalGeneration`` with a few minor modifications:
     - static (sinusoid) positional embeddings (``MarianConfig.static_position_embeddings=True``)
     - a new final_logits_bias (``MarianConfig.add_bias_logits=True``)
@@ -86,6 +86,19 @@ Code to see available pretrained models:
     suffix = [x.split('/')[1] for x in model_ids]
     multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
 
+MarianConfig
+~~~~~~~~~~~~~~~~~~~
+.. autoclass:: transformers.MarianConfig
+    :members:
+
+
+MarianTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MarianTokenizer
+    :members: prepare_translation_batch
+
+
 MarianMTModel
 ~~~~~~~~~~~~~
 
@@ -96,10 +109,3 @@ This class inherits all functionality from ``BartForConditionalGeneration``, see
 
 .. autoclass:: transformers.MarianMTModel
     :members:
-
-
-MarianTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MarianTokenizer
-    :members: prepare_translation_batch
diff --git a/docs/source/model_doc/roberta.rst b/docs/source/model_doc/roberta.rst
index 31b3999816..fae0d91a29 100644
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -1,6 +1,9 @@
 RoBERTa
 ----------------------------------------------------
 
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_
 by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer,
 Veselin Stoyanov. It is based on Google's BERT model released in 2018.
@@ -87,6 +90,14 @@ RobertaForTokenClassification
 .. autoclass:: transformers.RobertaForTokenClassification
     :members:
 
+
+RobertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RobertaForQuestionAnswering
+    :members:
+
+
 TFRobertaModel
 ~~~~~~~~~~~~~~~~~~~~
 
@@ -108,8 +119,22 @@ TFRobertaForSequenceClassification
     :members:
 
 
+TFRobertaForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRobertaForMultipleChoice
+    :members:
+
+
 TFRobertaForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.TFRobertaForTokenClassification
     :members:
+
+
+TFRobertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRobertaForQuestionAnswering
+    :members:
diff --git a/docs/source/model_doc/t5.rst b/docs/source/model_doc/t5.rst
index 38069801fd..11c16287a5 100644
--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -4,7 +4,8 @@ T5
 file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_
 
 Overview
-~~~~~
+~~~~~~~~~~~~~~~~~~~~~
+
 The T5 model was presented in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer <https://arxiv.org/pdf/1910.10683.pdf>`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu in 
 Here the abstract: 
 
@@ -14,10 +15,20 @@ Our systematic study compares pre-training objectives, architectures, unlabeled
 By combining the insights from our exploration with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. 
 To facilitate future work on transfer learning for NLP, we release our dataset, pre-trained models, and code.*
 
-The Authors' code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`_ .
+Tips:
+
+- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised 
+  and supervised tasks and for which each task is converted into a text-to-text format.
+  T5 works well on a variety of tasks out-of-the-box by prepending a different prefix to the input corresponding to each task, e.g.: for translation: *translate English to German: ..., summarize: ...*.
+  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`_ .
+- For sequence to sequence generation, it is recommended to use ``T5ForConditionalGeneration.generate()``. The method takes care of feeding the encoded input via cross-attention layers to the decoder and auto-regressively generates the decoder output.
+- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
+
+The original code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`_.
 
 Training
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~
+
 T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher forcing.
 This means that for training we always need an input sequence and a target sequence. 
 The input sequence is fed to the model using ``input_ids``. The target sequence is shifted to the right, *i.e.* prepended by a start-sequence token and fed to the decoder using the `decoder_input_ids`. In teacher-forcing style, the target sequence is then appended by the EOS token and corresponds to the ``lm_labels``. The PAD token is hereby used as the start-sequence token.
@@ -50,17 +61,6 @@ T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
   # the forward function automatically creates the correct decoder_input_ids
   model(input_ids=input_ids, lm_labels=lm_labels)
 
-Tips
-~~~~~~~~~~~~~~~~~~~~
-- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised 
-  and supervised tasks and for which each task is converted into a text-to-text format.
-  T5 works well on a variety of tasks out-of-the-box by prepending a different prefix to the input corresponding to each task, e.g.: for translation: *translate English to German: ..., summarize: ...*.
-  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`_ .
-- For sequence to sequence generation, it is recommended to use ``T5ForConditionalGeneration.generate()``. The method takes care of feeding the encoded input via cross-attention layers to the decoder and auto-regressively generates the decoder output.
-- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
-
-The original code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`_.
-
 
 T5Config
 ~~~~~~~~~~~~~~~~~~~~~
@@ -99,7 +99,7 @@ TFT5Model
 
 
 TFT5ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.TFT5ForConditionalGeneration
     :members:
diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst
index 3afb4124c5..b043a1beca 100644
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -102,6 +102,21 @@ TFXLMForSequenceClassification
     :members:
 
 
+TFXLMForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMForMultipleChoice
+    :members:
+
+
+TFXLMForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMForTokenClassification
+    :members:
+
+
+
 TFXLMForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/model_doc/xlmroberta.rst b/docs/source/model_doc/xlmroberta.rst
index 4a9cb98181..0743b9d308 100644
--- a/docs/source/model_doc/xlmroberta.rst
+++ b/docs/source/model_doc/xlmroberta.rst
@@ -1,6 +1,9 @@
 XLM-RoBERTa
 ------------------------------------------
 
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__
 by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán,
 Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019.
@@ -102,8 +105,22 @@ TFXLMRobertaForSequenceClassification
     :members:
 
 
+TFXLMRobertaForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMRobertaForMultipleChoice
+    :members:
+
+
 TFXLMRobertaForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.TFXLMRobertaForTokenClassification
     :members:
+
+
+TFXLMRobertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMRobertaForQuestionAnswering
+    :members:
\ No newline at end of file
diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst
index dff63aa9da..79faab8d59 100644
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -71,13 +71,6 @@ XLNetForSequenceClassification
     :members:
 
 
-XLNetForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForTokenClassification
-    :members:
-
-
 XLNetForMultipleChoice
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -85,6 +78,13 @@ XLNetForMultipleChoice
     :members:
 
 
+XLNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLNetForTokenClassification
+    :members:
+
+
 XLNetForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -120,6 +120,20 @@ TFXLNetForSequenceClassification
     :members:
 
 
+TFLNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLNetForMultipleChoice
+    :members:
+
+
+TFXLNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLNetForTokenClassification
+    :members:
+
+
 TFXLNetForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py
index fea83559f2..493ebc339c 100644
--- a/src/transformers/modeling_electra.py
+++ b/src/transformers/modeling_electra.py
@@ -406,8 +406,8 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
         from transformers import BertTokenizer, BertForSequenceClassification
         import torch
 
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+        tokenizer = ElectraTokenizer.from_pretrained('bert-base-uncased')
+        model = ElectraForSequenceClassification.from_pretrained('bert-base-uncased')
 
         input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py
index dda95bfee5..d61036c05c 100644
--- a/src/transformers/modeling_longformer.py
+++ b/src/transformers/modeling_longformer.py
@@ -499,9 +499,10 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
 class LongformerModel(RobertaModel):
     """
     This class overrides :class:`~transformers.RobertaModel` to provide the ability to process
-    long sequences following the selfattention approach described in `Longformer: the Long-Document Transformer`_by
-    Iz Beltagy, Matthew E. Peters, and Arman Cohan. Longformer selfattention combines a local (sliding window)
-    and global attention to extend to long documents without the O(n^2) increase in memory and compute.
+    long sequences following the selfattention approach described in `Longformer: the Long-Document Transformer
+    <https://arxiv.org/abs/2004.05150>`_ by Iz Beltagy, Matthew E. Peters, and Arman Cohan. Longformer selfattention
+    combines a local (sliding window) and global attention to extend to long documents without the O(n^2) increase in
+    memory and compute.
 
     The selfattention module `LongformerSelfAttention` implemented here supports the combination of local and
     global attention but it lacks support for autoregressive attention and dilated attention. Autoregressive
@@ -509,9 +510,6 @@ class LongformerModel(RobertaModel):
     tasks. Future release will add support for autoregressive attention, but the support for dilated attention
     requires a custom CUDA kernel to be memory and compute efficient.
 
-    .. _`Longformer: the Long-Document Transformer`:
-        https://arxiv.org/abs/2004.05150
-
     """
 
     config_class = LongformerConfig
@@ -1154,7 +1152,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
 
     Returns:
         :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided):
+        loss (:obj:`torch.FloatTensor`` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
             Classification loss.
         classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
             `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py
index 7d3176311e..e2b704a26a 100644
--- a/src/transformers/modeling_roberta.py
+++ b/src/transformers/modeling_roberta.py
@@ -407,7 +407,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
 
     Returns:
         :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided):
+        loss (:obj:`torch.FloatTensor`` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
             Classification loss.
         classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
             `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py
index 522460a22e..6ffcb627bb 100644
--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -1328,7 +1328,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
 
     Returns:
         :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided):
+        loss (:obj:`torch.FloatTensor`` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
             Classification loss.
         classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
             `num_choices` is the second dimension of the input tensors. (see `input_ids` above).