Refactor Code samples; Test code samples (#5036)

* Refactor code samples * Test docstrings * Style * Tokenization examples * Run rust of tests * First step to testing source docs * Style and BART comment * Test the remainder of the code samples * Style * let to const * Formatting fixes * Ready for merge * Fix fixture + Style * Fix last tests * Update docs/source/quicktour.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Addressing @sgugger's comments + Fix MobileBERT in TF Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-06-25 16:46:00 -04:00
parent 315f464b0a
commit 364a5ae1f0
68 changed files with 1962 additions and 2979 deletions
--- a/docs/source/_static/css/code-snippets.css
+++ b/docs/source/_static/css/code-snippets.css
@@ -10,3 +10,7 @@
 .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
    color: #6670FF;
 }
+
+.highlight .gp {
+    color: #FB8D68;
+}
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -44,6 +44,7 @@
    display: flex;
    flex-direction: row;
    justify-content: flex-end;
+    margin-right: 30px;
 }

 .framework-selector > button {
@@ -60,6 +61,12 @@
    padding: 5px;
 }

+/* Copy button */
+
+a.copybtn {
+    margin: 3px;
+}
+
 /* The literal code blocks */
 .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
    color: #6670FF;
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -157,6 +157,8 @@ function platformToggle() {
    const codeBlocks = Array.from(document.getElementsByClassName("highlight"));
    const pytorchIdentifier = "## PYTORCH CODE";
    const tensorflowIdentifier = "## TENSORFLOW CODE";
+
+    const promptSpanIdentifier = `<span class="gp">&gt;&gt;&gt; </span>`
    const pytorchSpanIdentifier = `<span class="c1">${pytorchIdentifier}</span>`;
    const tensorflowSpanIdentifier = `<span class="c1">${tensorflowIdentifier}</span>`;

@@ -169,10 +171,22 @@ function platformToggle() {
        let tensorflowSpans;

        if(pytorchSpanPosition < tensorflowSpanPosition){
-            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, tensorflowSpanPosition);
+            const isPrompt = spans.slice(
+                spans.indexOf(tensorflowSpanIdentifier) - promptSpanIdentifier.length,
+                spans.indexOf(tensorflowSpanIdentifier)
+            ) == promptSpanIdentifier;
+            const finalTensorflowSpanPosition = isPrompt ? tensorflowSpanPosition - promptSpanIdentifier.length : tensorflowSpanPosition;
+
+            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, finalTensorflowSpanPosition);
            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, spans.length);
        }else{
-            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, pytorchSpanPosition);
+            const isPrompt = spans.slice(
+                spans.indexOf(pytorchSpanIdentifier) - promptSpanIdentifier.length,
+                spans.indexOf(pytorchSpanIdentifier)
+            ) == promptSpanIdentifier;
+            const finalPytorchSpanPosition = isPrompt ? pytorchSpanPosition - promptSpanIdentifier.length : pytorchSpanPosition;
+
+            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, finalPytorchSpanPosition);
            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, spans.length);
        }

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -44,7 +44,8 @@ extensions = [
    'sphinx.ext.napoleon',
    'recommonmark',
    'sphinx.ext.viewcode',
-    'sphinx_markdown_tables'
+    'sphinx_markdown_tables',
+    'sphinx_copybutton'
 ]

 # Add any paths that contain templates here, relative to this directory.
@@ -74,6 +75,8 @@ exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = None

+# Remove the prompt when copying examples
+copybutton_prompt_text = ">>> "

 # -- Options for HTML output -------------------------------------------------

--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -45,17 +45,16 @@ tokenizer, which is a `WordPiece <https://arxiv.org/pdf/1609.08144.pdf>`__ token

 ::

-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

-    sequence = "A Titan RTX has 24GB of VRAM"
+    >>> sequence = "A Titan RTX has 24GB of VRAM"

 The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.

 ::

-    tokenized_sequence = tokenizer.tokenize(sequence)
-    print(tokenized_sequence)
+    >>> tokenized_sequence = tokenizer.tokenize(sequence)

 The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
 in "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash prefix is
@@ -63,6 +62,7 @@ added for "RA" and "M":

 ::

+    >>> print(tokenized_sequence)
    ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']

 These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
@@ -71,14 +71,14 @@ the sentence to the tokenizer, which leverages the Rust implementation of

 ::

-    encoded_sequence = tokenizer(sequence)["input_ids"]
-    print(encoded_sequence)
+    >>> encoded_sequence = tokenizer(sequence)["input_ids"]

 The tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly. The
 token indices are under the key "input_ids":

 ::

+    >>> print(encoded_sequence)
    [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]

 Note that the tokenizer automatically adds "special tokens" (if the associated model relies on them) which are special
@@ -86,13 +86,14 @@ IDs the model sometimes uses. If we decode the previous sequence of ids,

 ::

-    tokenizer.decode(encoded_sequence)
+    >>> decoded_sequence = tokenizer.decode(encoded_sequence)

 we will see 

 ::

-    '[CLS] A Titan RTX has 24GB of VRAM [SEP]'
+    >>> print(decoded_sequence)
+    [CLS] A Titan RTX has 24GB of VRAM [SEP]

 because this is the way a :class:`~transformers.BertModel` is going to expect its inputs.

@@ -108,21 +109,20 @@ For example, consider these two sequences:

 ::

-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

-    sequence_a = "This is a short sequence."
-    sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
+    >>> sequence_a = "This is a short sequence."
+    >>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

-    encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
-    encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
-    
-    len(encoded_sequence_a), len(encoded_sequence_b)
+    >>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
+    >>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]

 The encoded versions have different lengths:

 ::

+    >>> len(encoded_sequence_a), len(encoded_sequence_b)
    (8, 19)

 Therefore, we can't put them together in the same tensor as-is. The first sequence needs to be padded up to the length
@@ -133,15 +133,14 @@ it to pad like this:

 ::

-    padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
-    padded_sequences["input_ids"]
+    >>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)

 We can see that 0s have been added on the right of the first sentence to make it the same length as the second one:

 ::

-    [[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
+    >>> padded_sequences["input_ids"]
+    [[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]

 This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
 the position of the padded indices so that the model does not attend to them. For the
@@ -150,14 +149,8 @@ a padded value. This attention mask is in the dictionary returned by the tokeniz

 ::

-    padded_sequences["attention_mask"]
-
-will give back
-
-::
-
-    [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
+    >>> padded_sequences["attention_mask"]
+    [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

 .. _token-type-ids:

@@ -170,26 +163,27 @@ tokens. For example, the BERT model builds its two sequence input as such:

 ::

-   # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
+   >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]

 We can use our tokenizer to automatically generate such a sentence by passing the two sequences as two arguments (and
 not a list like before) like this:

 ::

-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-    sequence_a = "HuggingFace is based in NYC"
-    sequence_b = "Where is HuggingFace based?"
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> sequence_a = "HuggingFace is based in NYC"
+    >>> sequence_b = "Where is HuggingFace based?"

-    encoded_dict = tokenizer(sequence_a, sequence_b)
-    tokenizer.decode(encoded_dict["input_ids"])
+    >>> encoded_dict = tokenizer(sequence_a, sequence_b)
+    >>> decoded = tokenizer.decode(encoded_dict["input_ids"])

 which will return:

 ::

-    "[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]"
+    >>> print(decoded)
+    [CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]

 This is enough for some models to understand where one sequence ends and where another begins. However, other models
 such as BERT have an additional mechanism, which are the token type IDs (also called segment IDs). They are a binary
@@ -199,12 +193,7 @@ The tokenizer returns in the dictionary under the key "token_type_ids":

 ::

-    encoded_dict['token_type_ids']
-
-will return
-
-::
-
+    >>> encoded_dict['token_type_ids']
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]

 The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the
--- a/docs/source/multilingual.rst
+++ b/docs/source/multilingual.rst
@@ -36,10 +36,11 @@ Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language m

 .. code-block::

-    import torch
-    from transformers import XLMTokenizer, XLMWithLMHeadModel
+    >>> import torch
+    >>> from transformers import XLMTokenizer, XLMWithLMHeadModel

-    tokenizer = XLMTokenizer.from_pretrained("xlm-clm-1024-enfr")
+    >>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
+    >>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")


 The different languages this model/tokenizer handles, as well as the ids of these languages are visible using the
@@ -47,16 +48,15 @@ The different languages this model/tokenizer handles, as well as the ids of thes

 .. code-block::

-    # Continuation of the previous script
-    print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}
+    >>> print(tokenizer.lang2id)
+    {'en': 0, 'fr': 1}


 These ids should be used when passing a language parameter during a model pass. Let's define our inputs:

 .. code-block::

-    # Continuation of the previous script
-    input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
+    >>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1


 We should now define the language embedding by using the previously defined language id. We want to create a tensor
@@ -64,20 +64,18 @@ filled with the appropriate language ids, of the same size as input_ids. For eng

 .. code-block::

-    # Continuation of the previous script
-    language_id = tokenizer.lang2id['en']  # 0
-    langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
+    >>> language_id = tokenizer.lang2id['en']  # 0
+    >>> langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])

-    # We reshape it to be of size (batch_size, sequence_length)
-    langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
+    >>> # We reshape it to be of size (batch_size, sequence_length)
+    >>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)


 You can then feed it all as input to your model:

 .. code-block::

-    # Continuation of the previous script
-    outputs = model(input_ids, langs=langs)
+    >>> outputs = model(input_ids, langs=langs)


 The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py>`__
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -32,40 +32,33 @@ provides the following tasks out of the box:
 Let's see how this works for sentiment analysis (the other tasks are all covered in the
 :doc:`task summary </task_summary>`):

-::
+.. code-block::

-    from transformers import pipeline
-    classifier = pipeline('sentiment-analysis')
+    >>> from transformers import pipeline
+    >>> classifier = pipeline('sentiment-analysis')

 When typing this command for the first time, a pretrained model and its tokenizer are downloaded and cached. We will
 look at both later on, but as an introduction the tokenizer's job is to preprocess the text for the model, which is
 then responsible for making predictions. The pipeline groups all of that together, and post-processes the predictions to
-make them readable. For instance
+make them readable. For instance:

-::

-    classifier('We are very happy to show you the 🤗 Transformers library.')
-
-will return something like this:
-
-::
+.. code-block::

+    >>> classifier('We are very happy to show you the 🤗 Transformers library.')
    [{'label': 'POSITIVE', 'score': 0.9997795224189758}]

 That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model as a
-`batch`:
+`batch`, returning a list of dictionaries like this one:

-::
+.. code-block::

-    classifier(["We are very happy to show you the 🤗 Transformers library.",
-                "We hope you don't hate it."])
-
-returning a list of dictionaries like this one:
-
-::
-
-    [{'label': 'POSITIVE', 'score': 0.9997795224189758},
-     {'label': 'NEGATIVE', 'score': 0.5308589935302734}]
+    >>> results = classifier(["We are very happy to show you the 🤗 Transformers library.",
+    ...            "We hope you don't hate it."])
+    >>> for result in results:
+    ...     print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
+    label: POSITIVE, with score: 0.9998
+    label: NEGATIVE, with score: 0.5309

 You can see the second sentence has been classified as negative (it needs to be positive or negative) but its score is
 fairly neutral.
@@ -83,9 +76,9 @@ see how we can use it.

 You can directly pass the name of the model to use to :func:`~transformers.pipeline`:

-::
+.. code-block::

-    classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
+    >>> classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")

 This classifier can now deal with texts in English, French, but also Dutch, German, Italian and Spanish! You can also
 replace that name by a local folder where you have saved a pretrained model (see below). You can also pass a model
@@ -98,29 +91,30 @@ tokenizer associated to the model we picked and instantiate it. The second is
 the model itself. Note that if we were using the library on another task, the class of the model would change. The
 :doc:`task summary </task_summary>` tutorial summarizes which class is used for which task.

-::
+.. code-block::

-    ## PYTORCH CODE
-    from transformers import AutoTokenizer, AutoModelForSequenceClassification
-    ## TENSORFLOW CODE
-    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

 Now, to download the models and tokenizer we found previously, we just have to use the
 :func:`~transformers.AutoModelForSequenceClassification.from_pretrained` method (feel free to replace ``model_name`` by
 any other model from the model hub):

-::
+.. code-block::

-    ## PYTORCH CODE
-    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
-    model = AutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    pipe = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
-    ## TENSORFLOW CODE
-    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
-    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+    >>> ## PYTORCH CODE
+    >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+    >>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> pipe = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+    >>> ## TENSORFLOW CODE
+    >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+    >>> # This model only exists in PyTorch, so we use the `from_pt` flag to import that model in TensorFlow.
+    >>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True) 
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

 If you don't find a model that has been pretrained on some data similar to yours, you will need to fine-tune a
 pretrained model on your data. We provide :doc:`example scripts </examples>` to do so. Once you're done, don't forget
@@ -136,16 +130,16 @@ using the :obj:`from_pretrained` method:

 ::

-    ## PYTORCH CODE
-    from transformers import AutoTokenizer, AutoModelForSequenceClassification
-    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
-    model = AutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    ## TENSORFLOW CODE
-    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
-    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
-    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)

 Using the tokenizer
 ^^^^^^^^^^^^^^^^^^^
@@ -161,48 +155,56 @@ the model. To do this, the tokenizer has a `vocab`, which is the part we downloa

 To apply these steps on a given text, we can just feed it to our tokenizer:

-::
+.. code-block::

-    input = tokenizer("We are very happy to show you the 🤗 Transformers library.")
-    print(input)
+    >>> inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")

 This returns a dictionary mapping strings to lists of ints. It contains the `ids of the tokens <glossary.html#input-ids>`__,
 as mentioned before, but also additional arguments that will be useful to the model. Here for instance, we also have an
 `attention mask <glossary.html#attention-mask>`__ that the model will use to have a better understanding of the sequence:


-::
-    {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102],
-     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+.. code-block::
+
+    >>> print(inputs)
+    {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

 You can pass a list of sentences directly to your tokenizer. If your goal is to send them through your model as a
 batch, you probably want to pad them all to the same length, truncate them to the maximum length the model can accept
 and get tensors back. You can specify all of that to the tokenizer:

-::
+.. code-block::

-    ## PYTORCH CODE
-    batch = tokenizer(
-        ["We are very happy to show you the 🤗 Transformers library.",
-         "We hope you don't hate it."],
-        padding=True, truncation=True, return_tensors="pt")
-    print(batch)
-    ## TENSORFLOW CODE
-    batch = tokenizer(
-        ["We are very happy to show you the 🤗 Transformers library.",
-         "We hope you don't hate it."],
-        padding=True, truncation=True, return_tensors="tf")
-    print(batch)
+    >>> ## PYTORCH CODE
+    >>> pt_batch = tokenizer(
+    ...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
+    ...     padding=True,
+    ...     truncation=True,
+    ...     return_tensors="pt"
+    ... )
+    >>> ## TENSORFLOW CODE
+    >>> tf_batch = tokenizer(
+    ...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
+    ...     padding=True,
+    ...     truncation=True,
+    ...     return_tensors="tf"
+    ... )

 The padding is automatically applied on the side the model expects it (in this case, on the right), with the
 padding token the model was pretrained with. The attention mask is also adapted to take the padding into account:

-::
+.. code-block::

-    {'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996,   100, 19081,  3075,  1012,   102],
-                          [  101,  2057,  3246,  2017,  2123,  1005,  1056,  5223,  2009,  1012,   102,     0,     0,     0]]), 
-     'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                               [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}
+    >>> ## PYTORCH CODE
+    >>> for key, value in pt_batch.items():
+    ...     print(f"{key}: {value.numpy().tolist()}")
+    input_ids: [[101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], [101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0]]
+    attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]
+    >>> ## TENSORFLOW CODE
+    >>> for key, value in tf_batch.items():
+    ...     print(f"{key}: {value.numpy().tolist()}")
+    input_ids: [[101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], [101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0]]
+    attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]

 You can learn more about tokenizers :doc:`here <preprocessing>`.

@@ -213,20 +215,27 @@ Once your input has been preprocessed by the tokenizer, you can directly send it
 contain all the relevant information the model needs. If you're using a TensorFlow model, you can directly pass the
 dictionary of keys to tensors; for a PyTorch model, you need to unpack the dictionary by adding :obj:`**`.

-::
+.. code-block::

-    ## PYTORCH CODE
-    outputs = model(**batch)
-    ## TENSORFLOW CODE
-    outputs = model(batch)
+    >>> ## PYTORCH CODE
+    >>> pt_outputs = pt_model(**pt_batch)
+    >>> ## TENSORFLOW CODE
+    >>> tf_outputs = tf_model(tf_batch)

 In 🤗 Transformers, all outputs are tuples (with only one element potentially). Here, we get a tuple with just the
 final activations of the model.

-::
+.. code-block::

-    (tensor([[-4.1329,  4.3811],
-             [ 0.0818, -0.0418]]),)
+    >>> ## PYTORCH CODE
+    >>> print(pt_outputs)
+    (tensor([[-4.0833,  4.3364],
+            [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>),)
+    >>> ## TENSORFLOW CODE
+    >>> print(tf_outputs)
+    (<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
+    array([[-4.0832963 ,  4.3364134 ],
+           [ 0.08181238, -0.04178794]], dtype=float32)>,)

 .. note::

@@ -235,33 +244,39 @@ final activations of the model.

 Let's apply the SoftMax activation to get predictions.

-::
+.. code-block::

-    ## PYTORCH CODE
-    import torch.nn.functional as F
-    predictions = F.softmax(outputs[0], dim=-1)
-    print(predictions)
-    ## TENSORFLOW CODE
-    predictions = tf.nn.softmax(outputs[0], axis=-1)
-    print(predictions)
+    >>> ## PYTORCH CODE
+    >>> import torch.nn.functional as F
+    >>> pt_predictions = F.softmax(pt_outputs[0], dim=-1)
+    >>> ## TENSORFLOW CODE
+    >>> import tensorflow as tf
+    >>> tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)

 We can see we get the numbers from before:

-::
+.. code-block::

-    tensor([[2.0060e-04, 9.9980e-01],
-            [5.3086e-01, 4.6914e-01]])
+    >>> ## TENSORFLOW CODE
+    >>> print(tf_predictions)
+    tf.Tensor(
+    [[2.2042994e-04 9.9977952e-01]
+     [5.3086078e-01 4.6913919e-01]], shape=(2, 2), dtype=float32)
+    >>> ## PYTORCH CODE
+    >>> print(pt_predictions)
+    tensor([[2.2043e-04, 9.9978e-01],
+            [5.3086e-01, 4.6914e-01]], grad_fn=<SoftmaxBackward>)

 If you have labels, you can provide them to the model, it will return a tuple with the loss and the final activations.

-::
+.. code-block::

-    ## PYTORCH CODE
-    import torch
-    outputs = model(**batch, labels = torch.tensor([1, 0])
-    ## TENSORFLOW CODE
-    import tensorflow as tf
-    outputs = model(batch, labels = tf.constant([1, 0])
+    >>> ## PYTORCH CODE
+    >>> import torch
+    >>> pt_outputs = pt_model(**pt_batch, labels = torch.tensor([1, 0]))
+    >>> ## TENSORFLOW CODE
+    >>> import tensorflow as tf
+    >>> tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))

 Models are standard `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ or
 `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ so you can use them in your usual
@@ -298,12 +313,12 @@ Lastly, you can also ask the model to return all hidden states and all attention

 ::

-    ## PYTORCH CODE
-    outputs = model(**batch, output_hidden_states=True, output_attentions=True)
-    all_hidden_states, all_attentions = outputs[-2:]
-    ## TENSORFLOW CODE
-    outputs = model(batch, output_hidden_states=True, output_attentions=True)
-    all_hidden_states, all_attentions = outputs[-2:]
+    >>> ## PYTORCH CODE
+    >>> pt_outputs = pt_model(**pt_batch, output_hidden_states=True, output_attentions=True)
+    >>> all_hidden_states, all_attentions = pt_outputs[-2:]
+    >>> ## TENSORFLOW CODE
+    >>> tf_outputs = tf_model(tf_batch, output_hidden_states=True, output_attentions=True)
+    >>> all_hidden_states, all_attentions = tf_outputs[-2:]

 Accessing the code
 ^^^^^^^^^^^^^^^^^^
@@ -318,18 +333,18 @@ using the :doc:`DistilBERT </model_doc/distilbert>` architecture. The model auto
 to that specific model, or browse the source code. This is how you would directly instantiate model and tokenizer
 without the auto magic:

-::
+.. code-block::

-    ## PYTORCH CODE
-    from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
-    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
-    model = DistilBertForSequenceClassification.from_pretrained(model_name)
-    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
-    ## TENSORFLOW CODE
-    from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
-    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
-    model = TFDistilBertForSequenceClassification.from_pretrained(model_name)
-    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+    >>> ## PYTORCH CODE
+    >>> from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> model = DistilBertForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)

 Customizing the model
 ^^^^^^^^^^^^^^^^^^^^^
@@ -345,18 +360,18 @@ Here we use the predefined vocabulary of DistilBERT (hence load the tokenizer wi
 instantiate the model from the configuration instead of using the
 :func:`~transformers.DistilBertForSequenceClassification.from_pretrained` method).

-::
+.. code-block::

-    ## PYTORCH CODE
-    from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
-    config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
-    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-    model = DistilBertForSequenceClassification(config)
-    ## TENSORFLOW CODE
-    from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
-    config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
-    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-    model = TFDistilBertForSequenceClassification(config)
+    >>> ## PYTORCH CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
+    >>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+    >>> model = DistilBertForSequenceClassification(config)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
+    >>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+    >>> model = TFDistilBertForSequenceClassification(config)

 For something that only changes the head of the model (for instance, the number of labels), you can still use a
 pretrained model for the body. For instance, let's define a classifier for 10 different labels using a pretrained body.
@@ -364,15 +379,15 @@ We could create a configuration with all the default values and just change the
 can directly pass any argument a configuration would take to the :func:`from_pretrained` method and it will update the
 default configuration with it:

-::
+.. code-block::

-    ## PYTORCH CODE
-    from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
-    model_name = "distilbert-base-uncased"
-    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
-    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
-    ## TENSORFLOW CODE
-    from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
-    model_name = "distilbert-base-uncased"
-    model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
-    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+    >>> ## PYTORCH CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased"
+    >>> model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased"
+    >>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
--- a/setup.py
+++ b/setup.py
@@ -86,7 +86,7 @@ extras["all"] = extras["serving"] + ["tensorflow", "torch"]

 extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "psutil"]
 # sphinx-rtd-theme==0.5.0 introduced big changes in the style.
-extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3"]
+extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3", "sphinx-copybutton"]
 extras["quality"] = [
    "black",
    "isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort",
--- a/src/transformers/configuration_albert.py
+++ b/src/transformers/configuration_albert.py
@@ -81,22 +81,22 @@ class AlbertConfig(PretrainedConfig):

        Example::

-            from transformers import AlbertConfig, AlbertModel
-            # Initializing an ALBERT-xxlarge style configuration
-            albert_xxlarge_configuration = AlbertConfig()
+            >>> from transformers import AlbertConfig, AlbertModel
+            >>> # Initializing an ALBERT-xxlarge style configuration
+            >>> albert_xxlarge_configuration = AlbertConfig()

-            # Initializing an ALBERT-base style configuration
-            albert_base_configuration = AlbertConfig(
-                hidden_size=768,
-                num_attention_heads=12,
-                intermediate_size=3072,
-            )
+            >>> # Initializing an ALBERT-base style configuration
+            >>> albert_base_configuration = AlbertConfig(
+            ...      hidden_size=768,
+            ...      num_attention_heads=12,
+            ...      intermediate_size=3072,
+            ...  )

-            # Initializing a model from the ALBERT-base style configuration
-            model = AlbertModel(albert_xxlarge_configuration)
+            >>> # Initializing a model from the ALBERT-xxlarge style configuration
+            >>> model = AlbertModel(albert_xxlarge_configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """

    model_type = "albert"
--- a/src/transformers/configuration_bart.py
+++ b/src/transformers/configuration_bart.py
@@ -73,9 +73,13 @@ class BartConfig(PretrainedConfig):
    ):
        r"""
            :class:`~transformers.BartConfig` is the configuration class for `BartModel`.
-            Examples:
-                config = BartConfig.from_pretrained('bart-large')
-                model = BartModel(config)
+
+            Examples::
+
+                >>> from transformers import BartConfig, BartModel
+
+                >>> config = BartConfig.from_pretrained('facebook/bart-large')
+                >>> model = BartModel(config)
        """
        if "hidden_size" in common_kwargs:
            raise ValueError("hidden size is called d_model")
--- a/src/transformers/configuration_bert.py
+++ b/src/transformers/configuration_bert.py
@@ -95,16 +95,16 @@ class BertConfig(PretrainedConfig):

        Example::

-            from transformers import BertModel, BertConfig
+            >>> from transformers import BertModel, BertConfig

-            # Initializing a BERT bert-base-uncased style configuration
-            configuration = BertConfig()
+            >>> # Initializing a BERT bert-base-uncased style configuration
+            >>> configuration = BertConfig()

-            # Initializing a model from the bert-base-uncased style configuration
-            model = BertModel(configuration)
+            >>> # Initializing a model from the bert-base-uncased style configuration
+            >>> model = BertModel(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """
    model_type = "bert"

--- a/src/transformers/configuration_ctrl.py
+++ b/src/transformers/configuration_ctrl.py
@@ -66,16 +66,16 @@ class CTRLConfig(PretrainedConfig):

        Example::

-            from transformers import CTRLModel, CTRLConfig
+            >>> from transformers import CTRLModel, CTRLConfig

-            # Initializing a CTRL configuration
-            configuration = CTRLConfig()
+            >>> # Initializing a CTRL configuration
+            >>> configuration = CTRLConfig()

-            # Initializing a model from the configuration
-            model = CTRLModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = CTRLModel(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """

    model_type = "ctrl"
--- a/src/transformers/configuration_distilbert.py
+++ b/src/transformers/configuration_distilbert.py
@@ -80,16 +80,16 @@ class DistilBertConfig(PretrainedConfig):

        Example::

-            from transformers import DistilBertModel, DistilBertConfig
+            >>> from transformers import DistilBertModel, DistilBertConfig

-            # Initializing a DistilBERT configuration
-            configuration = DistilBertConfig()
+            >>> # Initializing a DistilBERT configuration
+            >>> configuration = DistilBertConfig()

-            # Initializing a model from the configuration
-            model = DistilBertModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = DistilBertModel(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """
    model_type = "distilbert"

--- a/src/transformers/configuration_electra.py
+++ b/src/transformers/configuration_electra.py
@@ -101,16 +101,16 @@ class ElectraConfig(PretrainedConfig):

        Example::

-            from transformers import ElectraModel, ElectraConfig
+            >>> from transformers import ElectraModel, ElectraConfig

-            # Initializing a ELECTRA electra-base-uncased style configuration
-            configuration = ElectraConfig()
+            >>> # Initializing an ELECTRA electra-base-uncased style configuration
+            >>> configuration = ElectraConfig()

-            # Initializing a model from the electra-base-uncased style configuration
-            model = ElectraModel(configuration)
+            >>> # Initializing a model from the electra-base-uncased style configuration
+            >>> model = ElectraModel(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """
    model_type = "electra"

--- a/src/transformers/configuration_encoder_decoder.py
+++ b/src/transformers/configuration_encoder_decoder.py
@@ -42,20 +42,20 @@ class EncoderDecoderConfig(PretrainedConfig):

        Example::

-            from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
+            >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

-            # Initializing a BERT bert-base-uncased style configuration
-            config_encoder = BertConfig()
-            config_decoder = BertConfig()
+            >>> # Initializing a BERT bert-base-uncased style configuration
+            >>> config_encoder = BertConfig()
+            >>> config_decoder = BertConfig()

-            config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+            >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

-            # Initializing a Bert2Bert model from the bert-base-uncased style configurations
-            model = EncoderDecoderModel(config=config)
+            >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
+            >>> model = EncoderDecoderModel(config=config)

-            # Accessing the model configuration
-            config_encoder = model.config.encoder
-            config_decoder  = model.config.decoder
+            >>> # Accessing the model configuration
+            >>> config_encoder = model.config.encoder
+            >>> config_decoder  = model.config.decoder
    """
    model_type = "encoder_decoder"

--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -100,16 +100,16 @@ class GPT2Config(PretrainedConfig):

        Example::

-            from transformers import GPT2Model, GPT2Config
+            >>> from transformers import GPT2Model, GPT2Config

-            # Initializing a GPT2 configuration
-            configuration = GPT2Config()
+            >>> # Initializing a GPT2 configuration
+            >>> configuration = GPT2Config()

-            # Initializing a model from the configuration
-            model = GPT2Model(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = GPT2Model(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """

    model_type = "gpt2"
--- a/src/transformers/configuration_longformer.py
+++ b/src/transformers/configuration_longformer.py
@@ -49,16 +49,16 @@ class LongformerConfig(RobertaConfig):

        Example::

-            from transformers import LongformerConfig, LongformerModel
+            >>> from transformers import LongformerConfig, LongformerModel

-            # Initializing a Longformer configuration
-            configuration = LongformerConfig()
+            >>> # Initializing a Longformer configuration
+            >>> configuration = LongformerConfig()

-            # Initializing a model from the configuration
-            model = LongformerModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = LongformerModel(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """
    model_type = "longformer"

--- a/src/transformers/configuration_mobilebert.py
+++ b/src/transformers/configuration_mobilebert.py
@@ -85,16 +85,16 @@ class MobileBertConfig(PretrainedConfig):

         Example::

-            from transformers import MobileBertModel, MobileBertConfig
+            >>> from transformers import MobileBertModel, MobileBertConfig

-            # Initializing a MobileBERT configuration
-            configuration = MobileBertConfig()
+            >>> # Initializing a MobileBERT configuration
+            >>> configuration = MobileBertConfig()

-            # Initializing a model from the configuration above
-            model = MobileBertModel(configuration)
+            >>> # Initializing a model from the configuration above
+            >>> model = MobileBertModel(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config

        Attributes:
            pretrained_config_archive_map (Dict[str, str]):
--- a/src/transformers/configuration_openai.py
+++ b/src/transformers/configuration_openai.py
@@ -98,16 +98,16 @@ class OpenAIGPTConfig(PretrainedConfig):

        Example::

-            from transformers import OpenAIGPTConfig, OpenAIGPTModel
+            >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel

-            # Initializing a GPT configuration
-            configuration = OpenAIGPTConfig()
+            >>> # Initializing a GPT configuration
+            >>> configuration = OpenAIGPTConfig()

-            # Initializing a model from the configuration
-            model = OpenAIGPTModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = OpenAIGPTModel(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """

    model_type = "openai-gpt"
--- a/src/transformers/configuration_reformer.py
+++ b/src/transformers/configuration_reformer.py
@@ -125,16 +125,16 @@ class ReformerConfig(PretrainedConfig):

        Example::

-            from transformers import ReformerModel, ReformerConfig
+            >>> from transformers import ReformerModel, ReformerConfig

-            # Initializing a Reformer configuration
-            configuration = ReformerConfig()
+            >>> # Initializing a Reformer configuration
+            >>> configuration = ReformerConfig()

-            # Initializing a Reformer model
-            model = ReformerModel(configuration)
+            >>> # Initializing a Reformer model
+            >>> model = ReformerModel(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """
    model_type = "reformer"

--- a/src/transformers/configuration_roberta.py
+++ b/src/transformers/configuration_roberta.py
@@ -49,16 +49,16 @@ class RobertaConfig(BertConfig):

        Example::

-            from transformers import RobertaConfig, RobertaModel
+            >>> from transformers import RobertaConfig, RobertaModel

-            # Initializing a RoBERTa configuration
-            configuration = RobertaConfig()
+            >>> # Initializing a RoBERTa configuration
+            >>> configuration = RobertaConfig()

-            # Initializing a model from the configuration
-            model = RobertaModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = RobertaModel(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """
    model_type = "roberta"

--- a/src/transformers/configuration_transfo_xl.py
+++ b/src/transformers/configuration_transfo_xl.py
@@ -100,16 +100,16 @@ class TransfoXLConfig(PretrainedConfig):

        Example::

-            from transformers import TransfoXLConfig, TransfoXLModel
+            >>> from transformers import TransfoXLConfig, TransfoXLModel

-            # Initializing a Transformer XL configuration
-            configuration = TransfoXLConfig()
+            >>> # Initializing a Transformer XL configuration
+            >>> configuration = TransfoXLConfig()

-            # Initializing a model from the configuration
-            model = TransfoXLModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = TransfoXLModel(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """

    model_type = "transfo-xl"
--- a/src/transformers/configuration_xlm.py
+++ b/src/transformers/configuration_xlm.py
@@ -142,16 +142,16 @@ class XLMConfig(PretrainedConfig):

        Example::

-            from transformers import XLMConfig, XLMModel
+            >>> from transformers import XLMConfig, XLMModel

+            >>> # Initializing an XLM configuration
-            configuration = XLMConfig()
+            >>> # Initializing a XLM configuration
+            >>> configuration = XLMConfig()

-            # Initializing a model from the configuration
-            model = XLMModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = XLMModel(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """

    model_type = "xlm"
--- a/src/transformers/configuration_xlnet.py
+++ b/src/transformers/configuration_xlnet.py
@@ -113,16 +113,16 @@ class XLNetConfig(PretrainedConfig):

        Example::

-            from transformers import XLNetConfig, XLNetModel
+            >>> from transformers import XLNetConfig, XLNetModel

+            >>> # Initializing an XLNet configuration
-            configuration = XLNetConfig()
+            >>> # Initializing a XLNet configuration
+            >>> configuration = XLNetConfig()

-            # Initializing a model from the configuration
-            model = XLNetModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = XLNetModel(configuration)

-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
    """

    model_type = "xlnet"
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -488,11 +488,11 @@ class SquadProcessor(DataProcessor):

        Examples::

-            import tensorflow_datasets as tfds
-            dataset = tfds.load("squad")
+            >>> import tensorflow_datasets as tfds
+            >>> dataset = tfds.load("squad")

-            training_examples = get_examples_from_dataset(dataset, evaluate=False)
-            evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
+            >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
+            >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
        """

        if evaluate:
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -186,6 +186,263 @@ def add_end_docstrings(*docstr):
    return docstring_decorator


+# Doctest-style example for PyTorch token-classification models (one label per input token, batch size 1).
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
+PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import torch
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0)  # Batch size 1
+
+        >>> outputs = model(**inputs, labels=labels)
+        >>> loss, scores = outputs[:2]
+"""
+
+# Doctest-style example for PyTorch question-answering models (start/end positions in, loss and span scores out).
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
+PT_QUESTION_ANSWERING_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import torch
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+
+        >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
+        >>> loss, start_scores, end_scores = outputs[:3]
+"""
+
+# Doctest-style example for PyTorch sequence-classification models (a single label for the whole input).
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
+PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import torch
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(**inputs, labels=labels)
+        >>> loss, logits = outputs[:2]
+"""
+
+# Doctest-style example for PyTorch masked-LM models; the input ids are reused as the labels.
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
+PT_MASKED_LM_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import torch
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
+
+        >>> outputs = model(input_ids, labels=input_ids)
+        >>> loss, prediction_scores = outputs[:2]
+"""
+
+# Doctest-style example for bare PyTorch models; used when the class name only matches the generic "Model" marker.
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
+PT_BASE_MODEL_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import torch
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+"""
+
+# Doctest-style example for PyTorch multiple-choice models (one prompt paired with two candidate endings).
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`;
+# the dict comprehension's braces are doubled so that `str.format` emits them as literal braces.
+PT_MULTIPLE_CHOICE_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import torch
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> choice0 = "It is eaten with a fork and a knife."
+        >>> choice1 = "It is eaten while held in the hand."
+        >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
+
+        >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', pad_to_max_length=True)
+        >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels)  # batch size is 1
+
+        >>> # the linear classifier still needs to be trained
+        >>> loss, logits = outputs[:2]
+"""
+
+# Doctest-style example for PyTorch "LMHead" (causal LM) models; the input ids double as the labels.
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
+PT_CAUSAL_LM_SAMPLE = r"""
+    Example::
+
+        >>> import torch
+        >>> from transformers import {tokenizer_class}, {model_class}
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs, labels=inputs["input_ids"])
+        >>> loss, logits = outputs[:2]
+"""
+
+# Doctest-style example for TensorFlow token-classification models; labels are passed inside the inputs dict.
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
+TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+        >>> input_ids = inputs["input_ids"]
+        >>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
+
+        >>> outputs = model(inputs)
+        >>> loss, scores = outputs[:2]
+"""
+
+# Doctest-style example for TensorFlow question-answering models; the answer text is rebuilt from the argmax of
+# the start and end scores. `add_code_sample_docstrings` fills the {tokenizer_class}, {model_class} and
+# {checkpoint} placeholders through `str.format`.
+TF_QUESTION_ANSWERING_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+        >>> input_dict = tokenizer(question, text, return_tensors='tf')
+        >>> start_scores, end_scores = model(input_dict)
+
+        >>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
+        >>> answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
+"""
+
+# Doctest-style example for TensorFlow sequence-classification models; the label is passed inside the inputs dict.
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
+TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+        >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
+
+        >>> outputs = model(inputs)
+        >>> loss, logits = outputs[:2]
+"""
+
+# Doctest-style example for TensorFlow masked-LM models (no labels are passed; only prediction scores are read).
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
+# A blank line must separate "Example::" from the indented code so reStructuredText opens a literal block.
+TF_MASKED_LM_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
+
+        >>> outputs = model(input_ids)
+        >>> prediction_scores = outputs[0]
+"""
+
+# Doctest-style example for bare TensorFlow models; used when the class name only matches the generic "Model" marker.
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
+TF_BASE_MODEL_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+        >>> outputs = model(inputs)
+
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+"""
+
+# Doctest-style example for TensorFlow multiple-choice models (one prompt paired with two candidate endings).
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`;
+# the dict comprehension's braces are doubled so that `str.format` emits them as literal braces.
+TF_MULTIPLE_CHOICE_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> choice0 = "It is eaten with a fork and a knife."
+        >>> choice1 = "It is eaten while held in the hand."
+
+        >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', pad_to_max_length=True)
+        >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
+        >>> outputs = model(inputs)  # batch size is 1
+
+        >>> # the linear classifier still needs to be trained
+        >>> logits = outputs[0]
+"""
+
+# Doctest-style example for TensorFlow "LMHead" (causal LM) models; only the logits are read from the outputs.
+# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
+TF_CAUSAL_LM_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+        >>> outputs = model(inputs)
+        >>> logits = outputs[0]
+"""
+
+
+def add_code_sample_docstrings(*docstr, tokenizer_class=None, checkpoint=None):
+    """Build a decorator that appends ``docstr`` plus a formatted, task-specific code sample to ``fn.__doc__``."""
+    # (marker, TF template, PT template) rows are probed in order and the first marker contained in the class
+    # name wins, so the generic "Model" row has to stay last.
+    sample_table = (
+        ("SequenceClassification", TF_SEQUENCE_CLASSIFICATION_SAMPLE, PT_SEQUENCE_CLASSIFICATION_SAMPLE),
+        ("QuestionAnswering", TF_QUESTION_ANSWERING_SAMPLE, PT_QUESTION_ANSWERING_SAMPLE),
+        ("TokenClassification", TF_TOKEN_CLASSIFICATION_SAMPLE, PT_TOKEN_CLASSIFICATION_SAMPLE),
+        ("MultipleChoice", TF_MULTIPLE_CHOICE_SAMPLE, PT_MULTIPLE_CHOICE_SAMPLE),
+        ("MaskedLM", TF_MASKED_LM_SAMPLE, PT_MASKED_LM_SAMPLE),
+        ("LMHead", TF_CAUSAL_LM_SAMPLE, PT_CAUSAL_LM_SAMPLE),
+        ("Model", TF_BASE_MODEL_SAMPLE, PT_BASE_MODEL_SAMPLE),
+    )
+
+    def docstring_decorator(fn):
+        # First dotted component of the qualified name, i.e. the owning class when ``fn`` is a method.
+        model_class = fn.__qualname__.split(".")[0]
+        candidates = [(tf_doc, pt_doc) for marker, tf_doc, pt_doc in sample_table if marker in model_class]
+        if not candidates:
+            raise ValueError(f"Docstring can't be built for model {model_class}")
+        tf_sample, pt_sample = candidates[0]
+        code_sample = tf_sample if model_class.startswith("TF") else pt_sample
+        built_doc = code_sample.format(model_class=model_class, tokenizer_class=tokenizer_class, checkpoint=checkpoint)
+        fn.__doc__ = (fn.__doc__ or "") + "".join(docstr) + built_doc
+        return fn
+
+    return docstring_decorator
+
+
 def is_remote_url(url_or_filename):
    parsed = urlparse(url_or_filename)
    return parsed.scheme in ("http", "https")
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -24,13 +24,15 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss

 from .configuration_albert import AlbertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_bert import ACT2FN, BertEmbeddings, BertSelfAttention, prune_linear_layer
 from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices


 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "AlbertTokenizer"
+

 ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "albert-base-v1",
@@ -485,6 +487,7 @@ class AlbertModel(AlbertPreTrainedModel):
            self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
    def forward(
        self,
        input_ids=None,
@@ -521,18 +524,6 @@ class AlbertModel(AlbertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Example::
-
-        from transformers import AlbertModel, AlbertTokenizer
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertModel.from_pretrained('albert-base-v2')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -657,16 +648,16 @@ class AlbertForPreTraining(AlbertPreTrainedModel):

    Examples::

-        from transformers import AlbertTokenizer, AlbertForPreTraining
-        import torch
+        >>> from transformers import AlbertTokenizer, AlbertForPreTraining
+        >>> import torch

-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertForPreTraining.from_pretrained('albert-base-v2')
+        >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+        >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2')

-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)

-        prediction_scores, sop_scores = outputs[:2]
+        >>> prediction_scores, sop_scores = outputs[:2]

        """

@@ -763,6 +754,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
        return self.predictions.decoder

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
    def forward(
        self,
        input_ids=None,
@@ -802,18 +794,6 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Example::
-
-        from transformers import AlbertTokenizer, AlbertForMaskedLM
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, prediction_scores = outputs[:2]
-
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
@@ -863,6 +843,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
    def forward(
        self,
        input_ids=None,
@@ -899,19 +880,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-        Examples::
-
-            from transformers import AlbertTokenizer, AlbertForSequenceClassification
-            import torch
-
-            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-            model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-            loss, logits = outputs[:2]
-
        """

        outputs = self.albert(
@@ -962,6 +930,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
    def forward(
        self,
        input_ids=None,
@@ -996,21 +965,6 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import AlbertTokenizer, AlbertForTokenClassification
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertForTokenClassification.from_pretrained('albert-base-v2')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, scores = outputs[:2]
-
        """

        outputs = self.albert(
@@ -1062,6 +1016,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
    def forward(
        self,
        input_ids=None,
@@ -1104,21 +1059,6 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        # The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
-        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-        from transformers import AlbertTokenizer, AlbertForQuestionAnswering
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='pt')
-        start_scores, end_scores = model(**input_dict)
-
        """

        outputs = self.albert(
@@ -1176,6 +1116,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
    def forward(
        self,
        input_ids=None,
@@ -1213,25 +1154,6 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import AlbertTokenizer, AlbertForMultipleChoice
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertForMultipleChoice.from_pretrained('albert-base-v2')
-
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        choice0 = "It is eaten with a fork and a knife."
-        choice1 = "It is eaten while held in the hand."
-        labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
-
-        encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
-        outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
-
-        # the linear classifier still needs to be trained
-        loss, logits = outputs[:2]
        """
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -392,8 +392,8 @@ class AutoModel:

        Examples::

-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = AutoModel.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            >>> config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            >>> model = AutoModel.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
        """
        for config_class, model_class in MODEL_MAPPING.items():
            if isinstance(config, config_class):
@@ -480,8 +480,7 @@ class AutoModel:
        Examples::

            model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            assert model.config.output_attention == True
+            assert model.config.output_attentions == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
@@ -547,8 +546,8 @@ class AutoModelForPreTraining:

        Examples::

-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = AutoModelForPreTraining.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            >>> config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            >>> model = AutoModelForPreTraining.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
        """
        for config_class, model_class in MODEL_FOR_PRETRAINING_MAPPING.items():
            if isinstance(config, config_class):
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -27,12 +27,19 @@ from torch.nn import CrossEntropyLoss

 from .activations import ACT2FN
 from .configuration_bart import BartConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_utils import PreTrainedModel


 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "BartTokenizer"
+

 BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/bart-large",
@@ -56,14 +63,17 @@ BART_START_DOCSTRING = r"""

 """
 BART_GENERATION_EXAMPLE = r"""
-    Examples::
+    Summarization example::

        from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
+
        # see ``examples/summarization/bart/run_eval.py`` for a longer example
        model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
        tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+
        ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
-        inputs = tokenizer.batch_encode_plus([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
+        inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
+
        # Generate Summary
        summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
        print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
@@ -807,6 +817,7 @@ class BartModel(PretrainedBartModel):
        self.init_weights()

    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
    def forward(
        self,
        input_ids,
@@ -883,8 +894,7 @@ class BartModel(PretrainedBartModel):


@add_start_docstrings(
-    "The BART Model with a language modeling head. Can be used for summarization.",
-    BART_START_DOCSTRING + BART_GENERATION_EXAMPLE,
+    "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
 )
 class BartForConditionalGeneration(PretrainedBartModel):
    base_model_prefix = "model"
@@ -911,6 +921,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
        self.register_buffer("final_logits_bias", new_bias)

    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_end_docstrings(BART_GENERATION_EXAMPLE)
    def forward(
        self,
        input_ids,
@@ -951,18 +962,21 @@ class BartForConditionalGeneration(PretrainedBartModel):
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

-    Examples::
+    Conditional generation example::

            # Mask filling only works for bart-large
            from transformers import BartTokenizer, BartForConditionalGeneration
-            tokenizer = BartTokenizer.from_pretrained('bart-large')
+            tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
            TXT = "My friends are <mask> but they eat too many carbs."
-            model = BartForConditionalGeneration.from_pretrained('bart-large')
-            input_ids = tokenizer.batch_encode_plus([TXT], return_tensors='pt')['input_ids']
+
+            model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
+            input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
            logits = model(input_ids)[0]
+
            masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
            probs = logits[0, masked_index].softmax(dim=0)
            values, predictions = probs.topk(5)
+
            tokenizer.decode(predictions).split()
            # ['good', 'great', 'all', 'really', 'very']
        """
@@ -1068,6 +1082,7 @@ class BartForSequenceClassification(PretrainedBartModel):
        self.model._init_weights(self.classification_head.out_proj)

    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
    def forward(
        self,
        input_ids,
@@ -1088,32 +1103,19 @@ class BartForSequenceClassification(PretrainedBartModel):

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BartConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification loss (cross entropy)
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import BartTokenizer, BartForSequenceClassification
-        import torch
-
-        tokenizer = BartTokenizer.from_pretrained('bart-large')
-        model = BartForSequenceClassification.from_pretrained('bart-large')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute",
-        add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+                Classification loss (cross entropy)
+            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Attentions weights after the attention softmax, used to compute the weighted average in the
+                self-attention
+                heads.
        """
        if labels is not None:
            use_cache = False
@@ -1161,6 +1163,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
        self.model._init_weights(self.qa_outputs)

    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
    def forward(
        self,
        input_ids,
@@ -1200,25 +1203,6 @@ class BartForQuestionAnswering(PretrainedBartModel):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        # The checkpoint bart-large is not fine-tuned for question answering. Please see the
-        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-        from transformers import BartTokenizer, BartForQuestionAnswering
-        import torch
-
-        tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
-        model = BartForQuestionAnswering.from_pretrained('facebook/bart-large')
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_ids = tokenizer.encode(question, text)
-        start_scores, end_scores = model(torch.tensor([input_ids]))
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
        """
        if start_positions is not None and end_positions is not None:
            use_cache = False
@@ -1259,7 +1243,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
+        return outputs  # (loss), start_logits, end_logits, encoder_outputs, (hidden_states), (attentions)


 class SinusoidalPositionalEmbedding(nn.Embedding):
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -28,12 +28,14 @@ from torch.nn import CrossEntropyLoss, MSELoss

 from .activations import gelu, gelu_new, swish
 from .configuration_bert import BertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer


 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "BertTokenizer"
+
 BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "bert-base-uncased",
    "bert-large-uncased",
@@ -664,6 +666,7 @@ class BertModel(BertPreTrainedModel):
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
    def forward(
        self,
        input_ids=None,
@@ -702,20 +705,6 @@ class BertModel(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import BertModel, BertTokenizer
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertModel.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@@ -851,16 +840,16 @@ class BertForPreTraining(BertPreTrainedModel):

    Examples::

-        from transformers import BertTokenizer, BertForPreTraining
-        import torch
+        >>> from transformers import BertTokenizer, BertForPreTraining
+        >>> import torch

-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForPreTraining.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')

-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)

-        prediction_scores, seq_relationship_scores = outputs[:2]
+        >>> prediction_scores, seq_relationship_scores = outputs[:2]

        """
        if "masked_lm_labels" in kwargs:
@@ -958,19 +947,20 @@ class BertLMHeadModel(BertPreTrainedModel):
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

-        Examples::
+    Example::

-            from transformers import BertTokenizer, BertLMHeadModel
-            import torch
+        >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
+        >>> import torch

-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = BertLMHeadModel.from_pretrained('bert-base-uncased', is_decoder=True)
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> config = BertConfig.from_pretrained("bert-base-cased")
+        >>> config.is_decoder = True
+        >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)

-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=input_ids)
-
-            loss, prediction_scores = outputs[:2]
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)

+        >>> prediction_scores = outputs[0]  # Without labels, the prediction scores are the first element of the output tuple
        """

        outputs = self.bert(
@@ -1028,6 +1018,7 @@ class BertForMaskedLM(BertPreTrainedModel):
        return self.cls.predictions.decoder

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
    def forward(
        self,
        input_ids=None,
@@ -1069,20 +1060,6 @@ class BertForMaskedLM(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-        Examples::
-
-            from transformers import BertTokenizer, BertForMaskedLM
-            import torch
-
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = BertForMaskedLM.from_pretrained('bert-base-uncased')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=input_ids)
-
-            loss, prediction_scores = outputs[:2]
-
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
@@ -1185,18 +1162,18 @@ class BertForNextSentencePrediction(BertPreTrainedModel):

    Examples::

-        from transformers import BertTokenizer, BertForNextSentencePrediction
-        import torch
+        >>> from transformers import BertTokenizer, BertForNextSentencePrediction
+        >>> import torch

-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt')
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')

-        loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
-        assert logits[0, 0] < logits[0, 1] # next sentence was random
+        >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
+        >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
        """

        outputs = self.bert(
@@ -1240,6 +1217,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
    def forward(
        self,
        input_ids=None,
@@ -1276,21 +1254,6 @@ class BertForSequenceClassification(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForSequenceClassification
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, logits = outputs[:2]
-
        """

        outputs = self.bert(
@@ -1340,6 +1303,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
    def forward(
        self,
        input_ids=None,
@@ -1377,25 +1341,6 @@ class BertForMultipleChoice(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForMultipleChoice
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
-
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        choice0 = "It is eaten with a fork and a knife."
-        choice1 = "It is eaten while held in the hand."
-        labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
-
-        encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
-        outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
-
-        # the linear classifier still needs to be trained
-        loss, logits = outputs[:2]
        """
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

@@ -1453,6 +1398,7 @@ class BertForTokenClassification(BertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
    def forward(
        self,
        input_ids=None,
@@ -1487,21 +1433,6 @@ class BertForTokenClassification(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForTokenClassification
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForTokenClassification.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, scores = outputs[:2]
-
        """

        outputs = self.bert(
@@ -1554,6 +1485,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
    def forward(
        self,
        input_ids=None,
@@ -1596,25 +1528,6 @@ class BertForQuestionAnswering(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForQuestionAnswering
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        encoding = tokenizer.encode_plus(question, text)
-        input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
-        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
-        assert answer == "a nice puppet"
-
        """

        outputs = self.bert(
--- a/src/transformers/modeling_camembert.py
+++ b/src/transformers/modeling_camembert.py
@@ -31,6 +31,8 @@ from .modeling_roberta import (

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "CamembertTokenizer"
+
 CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "camembert-base",
    "Musixmatch/umberto-commoncrawl-cased-v1",
--- a/src/transformers/modeling_ctrl.py
+++ b/src/transformers/modeling_ctrl.py
@@ -24,12 +24,14 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss

 from .configuration_ctrl import CTRLConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import Conv1D, PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer


 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "CTRLTokenizer"
+
 CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "ctrl"
    # See all CTRL models at https://huggingface.co/models?filter=ctrl
@@ -326,6 +328,7 @@ class CTRLModel(CTRLPreTrainedModel):
            self.h[layer].multi_head_attention.prune_heads(heads)

    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
    def forward(
        self,
        input_ids=None,
@@ -358,20 +361,6 @@ class CTRLModel(CTRLPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import CTRLTokenizer, CTRLModel
-        import torch
-
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = CTRLModel.from_pretrained('ctrl')
-
-        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        use_cache = use_cache if use_cache is not None else self.config.use_cache
@@ -510,6 +499,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
        return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]}

    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
    def forward(
        self,
        input_ids=None,
@@ -552,19 +542,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import torch
-        from transformers import CTRLTokenizer, CTRLLMHeadModel
-
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = CTRLLMHeadModel.from_pretrained('ctrl')
-
-        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
-
        """
        transformer_outputs = self.transformer(
            input_ids,
--- a/src/transformers/modeling_distilbert.py
+++ b/src/transformers/modeling_distilbert.py
@@ -30,12 +30,13 @@ from torch.nn import CrossEntropyLoss

 from .activations import gelu
 from .configuration_distilbert import DistilBertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer


 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "DistilBertTokenizer"

 DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "distilbert-base-uncased",
@@ -409,6 +410,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
            self.transformer.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
    def forward(
        self,
        input_ids=None,
@@ -434,20 +436,6 @@ class DistilBertModel(DistilBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertModel
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = DistilBertModel.from_pretrained('distilbert-base-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@@ -506,6 +494,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
        return self.vocab_projector

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
    def forward(
        self,
        input_ids=None,
@@ -544,17 +533,6 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertForMaskedLM
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, prediction_scores = outputs[:2]
-
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
@@ -604,6 +582,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
    def forward(
        self,
        input_ids=None,
@@ -639,18 +618,6 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
        """
        distilbert_output = self.distilbert(
            input_ids=input_ids,
@@ -697,6 +664,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
    def forward(
        self,
        input_ids=None,
@@ -737,20 +705,6 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss, start_scores, end_scores = outputs[:3]
-
        """
        distilbert_output = self.distilbert(
            input_ids=input_ids,
@@ -806,6 +760,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
    def forward(
        self,
        input_ids=None,
@@ -838,19 +793,6 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertForTokenClassification
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
        """

        outputs = self.distilbert(
@@ -940,22 +882,23 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):

    Examples::

-        from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
-        import torch
+        >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
+        >>> import torch

-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
+        >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+        >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')

-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        choice0 = "It is eaten with a fork and a knife."
-        choice1 = "It is eaten while held in the hand."
-        labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> choice0 = "It is eaten with a fork and a knife."
+        >>> choice1 = "It is eaten while held in the hand."
+        >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1

-        encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
-        outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
+        >>> encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
+        >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
+
+        >>> # the linear classifier still needs to be trained
+        >>> loss, logits = outputs[:2]

-        # the linear classifier still needs to be trained
-        loss, logits = outputs[:2]
        """
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

--- a/src/transformers/modeling_electra.py
+++ b/src/transformers/modeling_electra.py
@@ -8,13 +8,14 @@ from torch.nn import CrossEntropyLoss, MSELoss

 from .activations import get_activation
 from .configuration_electra import ElectraConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_bert import BertEmbeddings, BertEncoder, BertLayerNorm, BertPreTrainedModel
 from .modeling_utils import SequenceSummary


 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "ElectraTokenizer"

 ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/electra-small-generator",
@@ -264,6 +265,7 @@ class ElectraModel(ElectraPreTrainedModel):
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
    def forward(
        self,
        input_ids=None,
@@ -291,20 +293,6 @@ class ElectraModel(ElectraPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import ElectraModel, ElectraTokenizer
-        import torch
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        model = ElectraModel.from_pretrained('google/electra-small-discriminator')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -383,6 +371,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
    def forward(
        self,
        input_ids=None,
@@ -419,21 +408,6 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForSequenceClassification
-        import torch
-
-        tokenizer = ElectraTokenizer.from_pretrained('bert-base-uncased')
-        model = ElectraForSequenceClassification.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, logits = outputs[:2]
-
        """
        discriminator_hidden_states = self.electra(
            input_ids,
@@ -521,16 +495,14 @@ class ElectraForPreTraining(ElectraPreTrainedModel):

    Examples::

-        from transformers import ElectraTokenizer, ElectraForPreTraining
-        import torch
+        >>> from transformers import ElectraTokenizer, ElectraForPreTraining
+        >>> import torch

-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
+        >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
+        >>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')

-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        prediction_scores, seq_relationship_scores = outputs[:2]
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> scores = model(input_ids)[0]

        """

@@ -589,6 +561,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
        return self.generator_lm_head

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
    def forward(
        self,
        input_ids=None,
@@ -628,20 +601,6 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-        Examples::
-
-            from transformers import ElectraTokenizer, ElectraForMaskedLM
-            import torch
-
-            tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
-            model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=input_ids)
-
-            loss, prediction_scores = outputs[:2]
-
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
@@ -696,6 +655,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
    def forward(
        self,
        input_ids=None,
@@ -730,21 +690,6 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import ElectraTokenizer, ElectraForTokenClassification
-        import torch
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        model = ElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, scores = outputs[:2]
-
        """

        discriminator_hidden_states = self.electra(
@@ -802,6 +747,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
    def forward(
        self,
        input_ids=None,
@@ -844,23 +790,6 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import ElectraTokenizer, ElectraForQuestionAnswering
-        import torch
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
-        model = ElectraForQuestionAnswering.from_pretrained('google/electra-base-discriminator')
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
-        input_ids, token_type_ids = encoding['input_ids'], encoding['token_type_ids']
-        start_scores, end_scores = model(input_ids, token_type_ids=token_type_ids)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0))
-        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
        """

        discriminator_hidden_states = self.electra(
@@ -918,6 +847,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
    def forward(
        self,
        input_ids=None,
@@ -954,25 +884,6 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import ElectraTokenizer, ElectraForMultipleChoice
-        import torch
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
-        model = ElectraForMultipleChoice.from_pretrained('google/electra-base-discriminator')
-
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        choice0 = "It is eaten with a fork and a knife."
-        choice1 = "It is eaten while held in the hand."
-        labels = torch.tensor(0) # choice0 is correct (according to Wikipedia ;))
-
-        encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
-        outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
-
-        # the linear classifier still needs to be trained
-        loss, logits = outputs[:2]
        """
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

--- a/src/transformers/modeling_encoder_decoder.py
+++ b/src/transformers/modeling_encoder_decoder.py
@@ -126,9 +126,8 @@ class EncoderDecoderModel(PreTrainedModel):

        Examples::

-            from transformers import EncoderDecoder
-
-            model = EncoderDecoder.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
+            >>> from transformers import EncoderDecoderModel
+            >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
        """

        kwargs_encoder = {
@@ -244,21 +243,21 @@ class EncoderDecoderModel(PreTrainedModel):

        Examples::

-            from transformers import EncoderDecoderModel, BertTokenizer
-            import torch
+            >>> from transformers import EncoderDecoderModel, BertTokenizer
+            >>> import torch

-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
+            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+            >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert

-            # forward
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
+            >>> # forward
+            >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)

-            # training
-            loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids)[:2]
+            >>> # training
+            >>> loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)[:2]

-            # generation
-            generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
+            >>> # generation
+            >>> generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)

        """

--- a/src/transformers/modeling_flaubert.py
+++ b/src/transformers/modeling_flaubert.py
@@ -22,7 +22,7 @@ import torch
 from torch.nn import functional as F

 from .configuration_flaubert import FlaubertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_xlm import (
    XLMForQuestionAnswering,
    XLMForQuestionAnsweringSimple,
@@ -35,6 +35,8 @@ from .modeling_xlm import (

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "FlaubertTokenizer"
+
 FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "flaubert/flaubert_small_cased",
    "flaubert/flaubert_base_uncased",
@@ -119,6 +121,7 @@ class FlaubertModel(XLMModel):
        self.pre_norm = getattr(config, "pre_norm", False)

    @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="flaubert/flaubert_base_cased")
    def forward(
        self,
        input_ids=None,
@@ -149,18 +152,6 @@ class FlaubertModel(XLMModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import FlaubertTokenizer, FlaubertModel
-        import torch
-
-        tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
-        model = FlaubertModel.from_pretrained('flaubert-base-cased')
-        input_ids = torch.tensor(tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -26,7 +26,7 @@ from torch.nn import CrossEntropyLoss

 from .activations import ACT2FN
 from .configuration_gpt2 import GPT2Config
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import (
    Conv1D,
    PreTrainedModel,
@@ -38,6 +38,8 @@ from .modeling_utils import (

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
+
 GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "gpt2",
    "gpt2-medium",
@@ -370,6 +372,7 @@ class GPT2Model(GPT2PreTrainedModel):
            self.h[layer].attn.prune_heads(heads)

    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
    def forward(
        self,
        input_ids=None,
@@ -403,18 +406,6 @@ class GPT2Model(GPT2PreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import GPT2Tokenizer, GPT2Model
-        import torch
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2Model.from_pretrained('gpt2')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@@ -553,6 +544,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
        return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]}

    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
    def forward(
        self,
        input_ids=None,
@@ -595,19 +587,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import torch
-        from transformers import GPT2Tokenizer, GPT2LMHeadModel
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2LMHeadModel.from_pretrained('gpt2')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
-
        """
        transformer_outputs = self.transformer(
            input_ids,
@@ -721,26 +700,26 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):

    Examples::

-        import torch
-        from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
+        >>> import torch
+        >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel

-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')

-        # Add a [CLS] to the vocabulary (we should train it also!)
-        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
-        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
-        print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
+        >>> # Add a [CLS] to the vocabulary (we should train it also!)
+        >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})

-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        encoded_choices = [tokenizer.encode(s) for s in choices]
-        cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size

-        input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
-        mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
+        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

-        outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
+        >>> mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
+
+        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]

        """
        if "lm_labels" in kwargs:
--- a/src/transformers/modeling_longformer.py
+++ b/src/transformers/modeling_longformer.py
@@ -24,13 +24,15 @@ from torch.nn import CrossEntropyLoss, MSELoss
 from torch.nn import functional as F

 from .configuration_longformer import LongformerConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_bert import BertPreTrainedModel
 from .modeling_roberta import RobertaLMHead, RobertaModel


 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "LongformerTokenizer"
+
 LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "allenai/longformer-base-4096",
    "allenai/longformer-large-4096",
@@ -609,22 +611,22 @@ class LongformerModel(RobertaModel):

    Examples::

-        import torch
-        from transformers import LongformerModel, LongformerTokenizer
+        >>> import torch
+        >>> from transformers import LongformerModel, LongformerTokenizer

-        model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
-        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
+        >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
+        >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

-        SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
-        input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
+        >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
+        >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

-        # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
-        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
-        attention_mask[:, [1, 4, 21,]] = 2  # Set global attention based on the task. For example,
-                                            # classification: the <s> token
-                                            # QA: question tokens
-                                            # LM: potentially on the beginning of sentences and paragraphs
-        sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)
+        >>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
+        >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
+        >>> attention_mask[:, [1, 4, 21,]] = 2  # Set global attention based on the task. For example,
+        ...                                     # classification: the <s> token
+        ...                                     # QA: question tokens
+        ...                                     # LM: potentially on the beginning of sentences and paragraphs
+        >>> sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -743,18 +745,18 @@ class LongformerForMaskedLM(BertPreTrainedModel):

    Examples::

-        import torch
-        from transformers import LongformerForMaskedLM, LongformerTokenizer
+        >>> import torch
+        >>> from transformers import LongformerForMaskedLM, LongformerTokenizer

-        model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
-        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
+        >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
+        >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

-        SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
-        input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
+        >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
+        >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

-        attention_mask = None  # default is local attention everywhere, which is a good choice for MaskedLM
-                               # check ``LongformerModel.forward`` for more details how to set `attention_mask`
-        loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids)
+        >>> attention_mask = None  # default is local attention everywhere, which is a good choice for MaskedLM
+        ...                        # check ``LongformerModel.forward`` for more details on how to set `attention_mask`
+        >>> loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        """

        if "masked_lm_labels" in kwargs:
@@ -807,6 +809,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
    def forward(
        self,
        input_ids=None,
@@ -843,19 +846,6 @@ class LongformerForSequenceClassification(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import LongformerTokenizer, LongformerForSequenceClassification
-        import torch
-
-        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
-        model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
        """

        if global_attention_mask is None:
@@ -973,25 +963,25 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):

    Examples::

-        from transformers import LongformerTokenizer, LongformerForQuestionAnswering
-        import torch
+        >>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
+        >>> import torch

-        tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
-        model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
+        >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
+        >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")

-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
-        input_ids = encoding["input_ids"]
+        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+        >>> encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
+        >>> input_ids = encoding["input_ids"]

-        # default is local attention everywhere
-        # the forward method will automatically set global attention on question tokens
-        attention_mask = encoding["attention_mask"]
+        >>> # default is local attention everywhere
+        >>> # the forward method will automatically set global attention on question tokens
+        >>> attention_mask = encoding["attention_mask"]

-        start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
+        >>> start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
+        >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

-        answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
-        answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
+        >>> answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
+        >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token

        """

@@ -1060,6 +1050,7 @@ class LongformerForTokenClassification(BertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
    def forward(
        self,
        input_ids=None,
@@ -1094,19 +1085,6 @@ class LongformerForTokenClassification(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import LongformerTokenizer, LongformerForTokenClassification
-        import torch
-
-        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
-        model = LongformerForTokenClassification.from_pretrained('allenai/longformer-base-4096')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
        """

        outputs = self.longformer(
@@ -1163,6 +1141,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
    def forward(
        self,
        input_ids=None,
@@ -1200,23 +1179,6 @@ class LongformerForMultipleChoice(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import LongformerTokenizer, LongformerForMultipleChoice
-        import torch
-
-        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
-        model = LongformerForMultipleChoice.from_pretrained('allenai/longformer-base-4096')
-        # context = "The dog is cute" | choice = "the dog" / "the cat"
-        choices = [("The dog is cute", "the dog"), ("The dog is cute", "the cat")]
-        input_ids = torch.tensor([tokenizer.encode(s[0], s[1], add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-
-        # global attention is automatically put on "the dog" and "the cat"
-        outputs = model(input_ids, labels=labels)
-        loss, classification_scores = outputs[:2]
-
        """
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

--- a/src/transformers/modeling_marian.py
+++ b/src/transformers/modeling_marian.py
@@ -31,18 +31,18 @@ class MarianMTModel(BartForConditionalGeneration):

    Examples::

-        from transformers import MarianTokenizer, MarianMTModel
-        from typing import List
-        src = 'fr'  # source language
-        trg = 'en'  # target language
-        sample_text = "où est l'arrêt de bus ?"
-        mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
+        >>> from transformers import MarianTokenizer, MarianMTModel
+        >>> from typing import List
+        >>> src = 'fr'  # source language
+        >>> trg = 'en'  # target language
+        >>> sample_text = "où est l'arrêt de bus ?"
+        >>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'

-        model = MarianMTModel.from_pretrained(mname)
-        tok = MarianTokenizer.from_pretrained(mname)
-        batch = tok.prepare_translation_batch(src_texts=[sample_text])  # don't need tgt_text for inference
-        gen = model.generate(**batch)  # for forward pass: model(**batch)
-        words: List[str] = tok.batch_decode(gen, skip_special_tokens=True)  # returns "Where is the the bus stop ?"
+        >>> model = MarianMTModel.from_pretrained(mname)
+        >>> tok = MarianTokenizer.from_pretrained(mname)
+        >>> batch = tok.prepare_translation_batch(src_texts=[sample_text])  # don't need tgt_text for inference
+        >>> gen = model.generate(**batch)  # for forward pass: model(**batch)
+        >>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True)  # returns "Where is the the bus stop ?"

    """

--- a/src/transformers/modeling_mobilebert.py
+++ b/src/transformers/modeling_mobilebert.py
@@ -34,11 +34,14 @@ from transformers.modeling_bert import BertIntermediate

 from .activations import gelu, gelu_new, swish
 from .configuration_mobilebert import MobileBertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer


 logger = logging.getLogger(__name__)
+
+_TOKENIZER_FOR_DOC = "MobileBertTokenizer"
+
 MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = ["google/mobilebert-uncased"]


@@ -747,6 +750,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
    def forward(
        self,
        input_ids=None,
@@ -785,20 +789,6 @@ class MobileBertModel(MobileBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import MobileBertModel, MobileBertTokenizer
-        import torch
-
-        tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
-        model = MobileBertModel.from_pretrained(model_name_or_path)
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -951,13 +941,17 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
            heads.

    Examples::
-        from transformers import MobileBertTokenizer, MobileBertForPreTraining
-        import torch
-        tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
-        model = MobileBertForPreTraining.from_pretrained(model_name_or_path)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, seq_relationship_scores = outputs[:2]
+
+        >>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
+        >>> import torch
+
+        >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
+        >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
+
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+
+        >>> prediction_scores, seq_relationship_scores = outputs[:2]

        """
        outputs = self.mobilebert(
@@ -1022,6 +1016,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
            self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
    def forward(
        self,
        input_ids=None,
@@ -1063,20 +1058,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-        Examples::
-
-            from transformers import MobileBertTokenizer, MobileBertForMaskedLM
-            import torch
-
-            tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-            model = MobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=input_ids)
-
-            loss, prediction_scores = outputs[:2]
-
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
@@ -1174,18 +1155,17 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):

    Examples::

-        from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
-        import torch
+        >>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
+        >>> import torch

-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = MobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased')
+        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+        >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')

-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt')
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')

-        loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
-        assert logits[0, 0] < logits[0, 1] # next sentence was random
+        >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
        """

        outputs = self.mobilebert(
@@ -1228,6 +1208,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
    def forward(
        self,
        input_ids=None,
@@ -1263,20 +1244,6 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForSequenceClassification
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, logits = outputs[:2]
        """

        outputs = self.mobilebert(
@@ -1321,6 +1288,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
    def forward(
        self,
        input_ids=None,
@@ -1363,25 +1331,6 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import MobileBertTokenizer, MobileBertForQuestionAnswering
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
-        model = MobileBertForQuestionAnswering.from_pretrained(model_name_or_path)
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        encoding = tokenizer.encode_plus(question, text)
-        input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
-        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
-        assert answer == "a nice puppet"
-
        """

        outputs = self.mobilebert(
@@ -1439,6 +1388,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
    def forward(
        self,
        input_ids=None,
@@ -1476,25 +1426,6 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import MobileBertTokenizer, MobileBertForMultipleChoice
-        import torch
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = MobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
-
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        choice0 = "It is eaten with a fork and a knife."
-        choice1 = "It is eaten while held in the hand."
-        labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
-
-        encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
-        outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
-
-        # the linear classifier still needs to be trained
-        loss, logits = outputs[:2]
        """
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

@@ -1552,6 +1483,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
    def forward(
        self,
        input_ids=None,
@@ -1586,21 +1518,6 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import MobileBertTokenizer, MobileBertForTokenClassification
-        import torch
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = MobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, scores = outputs[:2]
-
        """

        outputs = self.mobilebert(
--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -28,7 +28,7 @@ from torch.nn import CrossEntropyLoss

 from .activations import gelu_new, swish
 from .configuration_openai import OpenAIGPTConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import (
    Conv1D,
    PreTrainedModel,
@@ -40,6 +40,8 @@ from .modeling_utils import (

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
+
 OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "openai-gpt",
    # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
@@ -356,6 +358,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
            self.h[layer].attn.prune_heads(heads)

    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
    def forward(
        self,
        input_ids=None,
@@ -383,18 +386,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
-        import torch
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = OpenAIGPTModel.from_pretrained('openai-gpt')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@@ -490,6 +481,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
        return self.lm_head

    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
    def forward(
        self,
        input_ids=None,
@@ -531,18 +523,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
-        import torch
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
-
    """
        transformer_outputs = self.transformer(
            input_ids,
--- a/src/transformers/modeling_reformer.py
+++ b/src/transformers/modeling_reformer.py
@@ -29,12 +29,20 @@ from torch.nn import CrossEntropyLoss

 from .activations import gelu, gelu_fast, gelu_new, swish
 from .configuration_reformer import ReformerConfig
-from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    DUMMY_INPUTS,
+    DUMMY_MASK,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_utils import PreTrainedModel, apply_chunking_to_forward


 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "ReformerTokenizer"
+
 REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/reformer-crime-and-punishment",
    "google/reformer-enwik8",
@@ -1543,6 +1551,7 @@ class ReformerModel(ReformerPreTrainedModel):
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
    def forward(
        self,
        input_ids=None,
@@ -1570,19 +1579,6 @@ class ReformerModel(ReformerPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import ReformerModel, ReformerTokenizer
-        import torch
-
-        tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
-        model =  ReformerModel.from_pretrained('google/reformer-crime-and-punishment')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1738,6 +1734,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
        pass

    @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
    def forward(
        self,
        input_ids=None,
@@ -1774,19 +1771,6 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import ReformerModelWithLMHead, ReformerTokenizer
-        import torch
-
-        tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
-        model =  ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-
-        loss, prediction_scores = outputs[:2]
        """

        reformer_outputs = self.reformer(
--- a/src/transformers/modeling_roberta.py
+++ b/src/transformers/modeling_roberta.py
@@ -24,12 +24,14 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss

 from .configuration_roberta import RobertaConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu


 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "RobertaTokenizer"
+
 ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "roberta-base",
    "roberta-large",
@@ -177,6 +179,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
        return self.lm_head.decoder

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
    def forward(
        self,
        input_ids=None,
@@ -216,18 +219,6 @@ class RobertaForMaskedLM(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import RobertaTokenizer, RobertaForMaskedLM
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForMaskedLM.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, prediction_scores = outputs[:2]
-
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
@@ -304,6 +295,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
    def forward(
        self,
        input_ids=None,
@@ -340,19 +332,6 @@ class RobertaForSequenceClassification(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import RobertaTokenizer, RobertaForSequenceClassification
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForSequenceClassification.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
        """
        outputs = self.roberta(
            input_ids,
@@ -400,6 +379,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
    def forward(
        self,
        input_ids=None,
@@ -437,20 +417,6 @@ class RobertaForMultipleChoice(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import RobertaTokenizer, RobertaForMultipleChoice
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForMultipleChoice.from_pretrained('roberta-base')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-        input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, classification_scores = outputs[:2]
-
        """
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

@@ -510,6 +476,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
    def forward(
        self,
        input_ids=None,
@@ -544,19 +511,6 @@ class RobertaForTokenClassification(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import RobertaTokenizer, RobertaForTokenClassification
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForTokenClassification.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
        """

        outputs = self.roberta(
@@ -632,6 +586,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
    def forward(
        self,
        input_ids=None,
@@ -674,25 +629,6 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        # The checkpoint roberta-large is not fine-tuned for question answering. Please see the
-        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-        from transformers import RobertaTokenizer, RobertaForQuestionAnswering
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_ids = tokenizer.encode(question, text)
-        start_scores, end_scores = model(torch.tensor([input_ids]))
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
        """

        outputs = self.roberta(
--- a/src/transformers/modeling_t5.py
+++ b/src/transformers/modeling_t5.py
@@ -33,6 +33,8 @@ from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, p

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "T5Tokenizer"
+
 ####################################################
 # This dict contains shortcut names and associated url
 # for the pretrained weights provided with the models
@@ -924,16 +926,17 @@ class T5Model(T5PreTrainedModel):
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

-    Examples::
+        Example::

-        from transformers import T5Tokenizer, T5Model
+            >>> from transformers import T5Tokenizer, T5Model

-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = T5Model.from_pretrained('t5-small')
-        input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
-        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = T5Model.from_pretrained('t5-small')

+            >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
+            >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
+
+            >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache

@@ -1068,18 +1071,18 @@ class T5ForConditionalGeneration(T5PreTrainedModel):

    Examples::

-        from transformers import T5Tokenizer, T5ForConditionalGeneration
+        >>> from transformers import T5Tokenizer, T5ForConditionalGeneration

-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = T5ForConditionalGeneration.from_pretrained('t5-small')
-        input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
-        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
-        loss, prediction_scores = outputs[:2]
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
+        >>> loss, prediction_scores = outputs[:2]

-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = T5ForConditionalGeneration.from_pretrained('t5-small')
-        input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
-        outputs = model.generate(input_ids)
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
+        >>> outputs = model.generate(input_ids)
        """

        if "lm_labels" in kwargs:
--- a/src/transformers/modeling_tf_albert.py
+++ b/src/transformers/modeling_tf_albert.py
@@ -21,7 +21,12 @@ import logging
 import tensorflow as tf

 from .configuration_albert import AlbertConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
 from .modeling_tf_utils import (
    TFMultipleChoiceLoss,
@@ -39,6 +44,8 @@ from .tokenization_utils import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "AlbertTokenizer"
+
 TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "albert-base-v1",
    "albert-large-v1",
@@ -713,6 +720,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
        self.albert = TFAlbertMainLayer(config, name="albert")

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
    def call(self, inputs, **kwargs):
        r"""
    Returns:
@@ -737,18 +745,6 @@ class TFAlbertModel(TFAlbertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertModel
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertModel.from_pretrained('albert-base-v2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        outputs = self.albert(inputs, **kwargs)
        return outputs
@@ -837,6 +833,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
        return self.albert.embeddings

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
    def call(self, inputs, **kwargs):
        r"""
    Returns:
@@ -854,18 +851,6 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForMaskedLM
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
        """
        outputs = self.albert(inputs, **kwargs)

@@ -895,6 +880,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
        )

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
    def call(
        self,
        inputs=None,
@@ -930,19 +916,6 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[8] if len(inputs) > 8 else labels
@@ -994,6 +967,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
        )

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
    def call(
        self,
        inputs=None,
@@ -1027,19 +1001,6 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForTokenClassification
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForTokenClassification.from_pretrained('albert-base-v2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[8] if len(inputs) > 8 else labels
@@ -1089,6 +1050,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
        )

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
    def call(
        self,
        inputs=None,
@@ -1130,24 +1092,6 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        # The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
-        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForQuestionAnswering
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-
        """
        if isinstance(inputs, (tuple, list)):
            start_positions = inputs[8] if len(inputs) > 8 else start_positions
@@ -1213,6 +1157,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
        return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
    def call(
        self,
        inputs,
@@ -1249,22 +1194,6 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForMultipleChoice
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tokenizer(choices, add_special_tokens=True, return_tensors='tf', truncation=True, padding=True)[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
--- a/src/transformers/modeling_tf_bert.py
+++ b/src/transformers/modeling_tf_bert.py
@@ -22,7 +22,12 @@ import numpy as np
 import tensorflow as tf

 from .configuration_bert import BertConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_utils import (
    TFMultipleChoiceLoss,
    TFPreTrainedModel,
@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "BertTokenizer"

 TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "bert-base-uncased",
@@ -704,6 +710,7 @@ class TFBertModel(TFBertPreTrainedModel):
        self.bert = TFBertMainLayer(config, name="bert")

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
    def call(self, inputs, **kwargs):
        r"""
    Returns:
@@ -728,18 +735,6 @@ class TFBertModel(TFBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertModel
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertModel.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        """
        outputs = self.bert(inputs, **kwargs)
        return outputs
@@ -819,6 +814,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
        return self.bert.embeddings

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -836,18 +832,6 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForMaskedLM
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
        """
        outputs = self.bert(inputs, **kwargs)

@@ -930,6 +914,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
        )

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
    def call(
        self,
        inputs=None,
@@ -965,19 +950,6 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForSequenceClassification
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[8] if len(inputs) > 8 else labels
@@ -1037,6 +1009,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
        return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
    def call(
        self,
        inputs,
@@ -1073,22 +1046,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForMultipleChoice
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
@@ -1177,6 +1134,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
        )

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
    def call(
        self,
        inputs=None,
@@ -1210,19 +1168,6 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForTokenClassification
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[8] if len(inputs) > 8 else labels
@@ -1273,6 +1218,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
        )

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
    def call(
        self,
        inputs=None,
@@ -1314,22 +1260,6 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForQuestionAnswering
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-        assert answer == "a nice puppet"
-
        """
        if isinstance(inputs, (tuple, list)):
            start_positions = inputs[8] if len(inputs) > 8 else start_positions
--- a/src/transformers/modeling_tf_ctrl.py
+++ b/src/transformers/modeling_tf_ctrl.py
@@ -22,7 +22,7 @@ import numpy as np
 import tensorflow as tf

 from .configuration_ctrl import CTRLConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_utils import (
    TFPreTrainedModel,
    TFSharedEmbeddings,
@@ -35,6 +35,8 @@ from .tokenization_utils import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "CTRLTokenizer"  # exact class name as imported in the previous examples (all-caps CTRL)
+
 TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "ctrl"
    # See all CTRL models at https://huggingface.co/models?filter=ctrl
@@ -489,6 +491,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
        self.transformer = TFCTRLMainLayer(config, name="transformer")

    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -510,18 +513,6 @@ class TFCTRLModel(TFCTRLPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import CTRLTokenizer, TFCTRLModel
-
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = TFCTRLModel.from_pretrained('ctrl')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        outputs = self.transformer(inputs, **kwargs)
        return outputs
@@ -569,6 +560,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
        return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}

    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -590,19 +582,6 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import CTRLTokenizer, TFCTRLLMHeadModel
-
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = TFCTRLLMHeadModel.from_pretrained('ctrl')
-
-        input_ids = tf.constant([tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)])
-        outputs = model(input_ids)
-        loss, logits = outputs[:2]
-
        """
        transformer_outputs = self.transformer(inputs, **kwargs)
        hidden_states = transformer_outputs[0]
--- a/src/transformers/modeling_tf_distilbert.py
+++ b/src/transformers/modeling_tf_distilbert.py
@@ -23,7 +23,12 @@ import numpy as np
 import tensorflow as tf

 from .configuration_distilbert import DistilBertConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_utils import (
    TFMultipleChoiceLoss,
    TFPreTrainedModel,
@@ -41,6 +46,7 @@ from .tokenization_utils import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "DistilBertTokenizer"

 TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "distilbert-base-uncased",
@@ -575,6 +581,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # Embeddings

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
    def call(self, inputs, **kwargs):
        r"""
    Returns:
@@ -592,17 +599,6 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertModel
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = TFDistilBertModel.from_pretrained('distilbert-base-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        """
        outputs = self.distilbert(inputs, **kwargs)
        return outputs
@@ -647,6 +643,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
        return self.vocab_projector.input_embeddings

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
    def call(self, inputs, **kwargs):
        r"""

@@ -665,18 +662,6 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
        """
        distilbert_output = self.distilbert(inputs, **kwargs)

@@ -713,6 +698,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
        self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
    def call(
        self,
        inputs=None,
@@ -746,19 +732,6 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[6] if len(inputs) > 6 else labels
@@ -809,6 +782,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
        )

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
    def call(
        self,
        inputs=None,
@@ -840,19 +814,6 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[6] if len(inputs) > 6 else labels
@@ -916,6 +877,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
        return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
    def call(
        self,
        inputs,
@@ -950,22 +912,6 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForMultipleChoice
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-        model = TFDistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
@@ -1046,6 +992,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
        self.dropout = tf.keras.layers.Dropout(config.qa_dropout)

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
    def call(
        self,
        inputs=None,
@@ -1085,21 +1032,6 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-
        """
        if isinstance(inputs, (tuple, list)):
            start_positions = inputs[6] if len(inputs) > 6 else start_positions
--- a/src/transformers/modeling_tf_electra.py
+++ b/src/transformers/modeling_tf_electra.py
@@ -4,7 +4,7 @@ import tensorflow as tf

 from transformers import ElectraConfig

-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
 from .modeling_tf_utils import (
    TFQuestionAnsweringLoss,
@@ -18,6 +18,7 @@ from .tokenization_utils import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "ElectraTokenizer"

 TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/electra-small-generator",
@@ -383,6 +384,7 @@ class TFElectraModel(TFElectraPreTrainedModel):
        self.electra = TFElectraMainLayer(config, name="electra")

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
    def call(self, inputs, **kwargs):
        r"""
    Returns:
@@ -400,17 +402,6 @@ class TFElectraModel(TFElectraPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import ElectraTokenizer, TFElectraModel
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        model = TFElectraModel.from_pretrained('google/electra-small-discriminator')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        """
        outputs = self.electra(inputs, **kwargs)
        return outputs
@@ -532,6 +523,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
        return self.generator_lm_head

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
    def call(
        self,
        input_ids=None,
@@ -560,18 +552,6 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import ElectraTokenizer, TFElectraForMaskedLM
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
-        model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
        """

        generator_hidden_states = self.electra(
@@ -611,6 +591,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
        )

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
    def call(
        self,
        inputs=None,
@@ -644,19 +625,6 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import ElectraTokenizer, TFElectraForTokenClassification
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[8] if len(inputs) > 8 else labels
@@ -705,6 +673,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
        )

    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
    def call(
        self,
        inputs=None,
@@ -746,22 +715,6 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import ElectraTokenizer, TFElectraForQuestionAnswering
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
-        model = TFElectraForQuestionAnswering.from_pretrained('google/electra-small-generator')
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-
        """
        if isinstance(inputs, (tuple, list)):
            start_positions = inputs[8] if len(inputs) > 8 else start_positions
--- a/src/transformers/modeling_tf_gpt2.py
+++ b/src/transformers/modeling_tf_gpt2.py
@@ -22,7 +22,7 @@ import numpy as np
 import tensorflow as tf

 from .configuration_gpt2 import GPT2Config
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_utils import (
    TFConv1D,
    TFPreTrainedModel,
@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
+
 TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "gpt2",
    "gpt2-medium",
@@ -490,6 +492,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
        self.transformer = TFGPT2MainLayer(config, name="transformer")

    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -511,18 +514,6 @@ class TFGPT2Model(TFGPT2PreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import GPT2Tokenizer, TFGPT2Model
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = TFGPT2Model.from_pretrained('gpt2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
    """
        outputs = self.transformer(inputs, **kwargs)
        return outputs
@@ -549,6 +540,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
        return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}

    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -570,19 +562,6 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = TFGPT2LMHeadModel.from_pretrained('gpt2')
-
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
        """
        transformer_outputs = self.transformer(inputs, **kwargs)
        hidden_states = transformer_outputs[0]
@@ -659,29 +638,26 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):

    Examples::

-        # For example purposes. Not runnable.
-        import tensorflow as tf
-        from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
+        >>> import tensorflow as tf
+        >>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel

-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')

-        # Add a [CLS] to the vocabulary (we should train it also!)
-        # This option is currently not implemented in TF 2.0
-        raise NotImplementedError
-        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
-        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
-        print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
+        >>> # Add a [CLS] to the vocabulary (we should train it also!)
+        >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})

-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        encoded_choices = [tokenizer.encode(s) for s in choices]
-        cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size

-        input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
-        mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
+        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

-        outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        >>> input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
+        >>> mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1
+
+        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]

        """
        if isinstance(inputs, (tuple, list)):
--- a/src/transformers/modeling_tf_mobilebert.py
+++ b/src/transformers/modeling_tf_mobilebert.py
@@ -21,7 +21,12 @@ import logging
 import tensorflow as tf

 from . import MobileBertConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish
 from .modeling_tf_utils import (
    TFMultipleChoiceLoss,
@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "MobileBertTokenizer"

 TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "mobilebert-uncased",
@@ -621,19 +627,6 @@ class TFMobileBertMLMHead(tf.keras.layers.Layer):
        return prediction_scores


-class TFMobileBertPreTrainingHeads(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.predictions = TFMobileBertLMPredictionHead(config, name="predictions")
-        self.seq_relationship = tf.keras.layers.Dense(2, name="seq_relationship")
-
-    def call(self, inputs):
-        sequence_output, pooled_output = inputs
-        prediction_scores = self.predictions(sequence_output)
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return prediction_scores, seq_relationship_score
-
-
@keras_serializable
 class TFMobileBertMainLayer(tf.keras.layers.Layer):
    config_class = MobileBertConfig
@@ -845,6 +838,7 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
        self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
    def call(self, inputs, **kwargs):
        r"""
    Returns:
@@ -869,18 +863,6 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertModel
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertModel.from_pretrained('mobilebert-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        """
        outputs = self.mobilebert(inputs, **kwargs)
        return outputs
@@ -895,7 +877,8 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
-        self.cls = TFMobileBertPreTrainingHeads(config, name="cls")
+        self.predictions = TFMobileBertMLMHead(config, name="predictions___cls")
+        self.seq_relationship = TFMobileBertOnlyNSPHead(2, name="seq_relationship___cls")

    def get_output_embeddings(self):
        return self.mobilebert.embeddings
@@ -923,20 +906,21 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):

    Examples::

-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
+        >>> import tensorflow as tf
+        >>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining

-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForPreTraining.from_pretrained('mobilebert-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, seq_relationship_scores = outputs[:2]
+        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+        >>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
+        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> prediction_scores, seq_relationship_scores = outputs[:2]

        """
        outputs = self.mobilebert(inputs, **kwargs)

        sequence_output, pooled_output = outputs[:2]
-        prediction_scores, seq_relationship_score = self.cls([sequence_output, pooled_output])
+        prediction_scores = self.predictions(sequence_output)
+        seq_relationship_score = self.seq_relationship(pooled_output)
        outputs = (prediction_scores, seq_relationship_score,) + outputs[
            2:
        ]  # add hidden states and attention if they are here
@@ -956,6 +940,7 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
        return self.mobilebert.embeddings

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -973,18 +958,6 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertForMaskedLM
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
        """
        outputs = self.mobilebert(inputs, **kwargs)

@@ -1015,7 +988,7 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
        super().__init__(config, *inputs, **kwargs)

        self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
-        self.cls = TFMobileBertOnlyNSPHead(config, name="cls")
+        self.cls = TFMobileBertOnlyNSPHead(config, name="seq_relationship___cls")

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    def call(self, inputs, **kwargs):
@@ -1038,18 +1011,17 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):

    Examples::

-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
+        >>> import tensorflow as tf
+        >>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction

-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased')
+        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+        >>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')

-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='tf')
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')

-        logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
-        assert logits[0][0] < logits[0][1] # the next sentence was random
+        >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
        """
        outputs = self.mobilebert(inputs, **kwargs)

@@ -1078,6 +1050,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
        )

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
    def call(
        self,
        inputs=None,
@@ -1113,19 +1086,6 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFBMobileBertForSequenceClassification
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForSequenceClassification.from_pretrained('mobilebert-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[8] if len(inputs) > 8 else labels
@@ -1176,6 +1136,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
        )

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
    def call(
        self,
        inputs=None,
@@ -1217,22 +1178,6 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertForQuestionAnswering
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForQuestionAnswering.from_pretrained('mobilebert-uncased')  # Not a fine-tuned model! Load a fine-tuned model to obtain coherent results.
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-        assert answer == "a nice puppet"
-
        """
        if isinstance(inputs, (tuple, list)):
            start_positions = inputs[8] if len(inputs) > 8 else start_positions
@@ -1298,6 +1243,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
        return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
    def call(
        self,
        inputs,
@@ -1334,22 +1280,6 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertForMultipleChoice
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
@@ -1438,6 +1368,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
        )

    @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
    def call(
        self,
        inputs=None,
@@ -1471,19 +1402,6 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertForTokenClassification
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[8] if len(inputs) > 8 else labels
--- a/src/transformers/modeling_tf_openai.py
+++ b/src/transformers/modeling_tf_openai.py
@@ -22,7 +22,7 @@ import numpy as np
 import tensorflow as tf

 from .configuration_openai import OpenAIGPTConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_utils import (
    TFConv1D,
    TFPreTrainedModel,
@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
+
 TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "openai-gpt",
    # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
@@ -449,6 +451,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")

    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -466,18 +469,6 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        outputs = self.transformer(inputs, **kwargs)
        return outputs
@@ -497,6 +488,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
        return self.transformer.tokens_embed

    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -514,18 +506,6 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
        """
        transformer_outputs = self.transformer(inputs, **kwargs)
        hidden_states = transformer_outputs[0]
@@ -601,26 +581,23 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):

    Examples::

-        # For example purposes. Not runnable.
-        import tensorflow as tf
-        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
+        >>> import tensorflow as tf
+        >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel

-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')

-        # Add a [CLS] to the vocabulary (we should train it also!)
-        # This option is currently not implemented in TF 2.0
-        raise NotImplementedError
-        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
-        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
-        print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
-
-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :]  # Batch size 1, 2 choices
-        mc_token_ids = tf.constant([input_ids.size(-1), input_ids.size(-1)])[None, :]  # Batch size 1
-        outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        >>> # Add a [CLS] to the vocabulary (we should train it also!)
+        >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
+        >>> model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
+        >>> print(tokenizer.cls_token_id, len(tokenizer))  # The newly added token is the last token of the vocabulary

+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+        >>> encoding = tokenizer(choices, return_tensors="tf")
+        >>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
+        >>> inputs["mc_token_ids"]= tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :]  # Batch size 1
+        >>> outputs = model(inputs)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
        """

        if isinstance(inputs, (tuple, list)):
@@ -633,7 +610,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
            mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
            output_attentions = inputs[7] if len(inputs) > 7 else output_attentions
            assert len(inputs) <= 8, "Too many inputs."
-        elif isinstance(inputs, dict):
+        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
--- a/src/transformers/modeling_tf_roberta.py
+++ b/src/transformers/modeling_tf_roberta.py
@@ -21,7 +21,12 @@ import logging
 import tensorflow as tf

 from .configuration_roberta import RobertaConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
 from .modeling_tf_utils import (
    TFMultipleChoiceLoss,
@@ -38,6 +43,8 @@ from .tokenization_utils_base import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "RobertaTokenizer"
+
 TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "roberta-base",
    "roberta-large",
@@ -195,6 +202,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
        self.roberta = TFRobertaMainLayer(config, name="roberta")

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
    def call(self, inputs, **kwargs):
        r"""
    Returns:
@@ -219,18 +227,6 @@ class TFRobertaModel(TFRobertaPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaModel
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaModel.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        outputs = self.roberta(inputs, **kwargs)
        return outputs
@@ -279,6 +275,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
        return self.lm_head.decoder

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -296,18 +293,6 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForMaskedLM
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
        """
        outputs = self.roberta(inputs, **kwargs)

@@ -358,6 +343,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
        self.classifier = TFRobertaClassificationHead(config, name="classifier")

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
    def call(
        self,
        inputs=None,
@@ -387,19 +373,6 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[8] if len(inputs) > 8 else labels
@@ -441,7 +414,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

-        self.roberta = TFBertMainLayer(config, name="roberta")
+        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
@@ -457,6 +430,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
        return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
    def call(
        self,
        inputs,
@@ -493,22 +467,6 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForMultipleChoice
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForMultipleChoice.from_pretrained('roberta-base')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
@@ -592,6 +550,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
        )

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
    def call(
        self,
        inputs=None,
@@ -625,19 +584,6 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForTokenClassification
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[8] if len(inputs) > 8 else labels
@@ -687,6 +633,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
        )

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
    def call(
        self,
        inputs=None,
@@ -728,24 +675,6 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        # The checkpoint roberta-base is not fine-tuned for question answering. Please see the
-        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForQuestionAnswering
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-
        """
        if isinstance(inputs, (tuple, list)):
            start_positions = inputs[8] if len(inputs) > 8 else start_positions
--- a/src/transformers/modeling_tf_t5.py
+++ b/src/transformers/modeling_tf_t5.py
@@ -37,6 +37,8 @@ from .tokenization_utils import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "T5Tokenizer"
+
 TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "t5-small",
    "t5-base",
@@ -931,13 +933,13 @@ class TFT5Model(TFT5PreTrainedModel):

    Examples::

-        from transformers import T5Tokenizer, TFT5Model
+        >>> from transformers import T5Tokenizer, TFT5Model

-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = TFT5Model.from_pretrained('t5-small')
-        inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
-        outputs = model(inputs, decoder_input_ids=inputs)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = TFT5Model.from_pretrained('t5-small')
+        >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
+        >>> outputs = model(inputs, decoder_input_ids=inputs)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """

@@ -1074,18 +1076,18 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):

    Examples::

-        from transformers import T5Tokenizer, TFT5ForConditionalGeneration
+        >>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration

-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
-        inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
-        outputs = model(inputs, decoder_input_ids=inputs)
-        prediction_scores = outputs[0]
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
+        >>> outputs = model(inputs, decoder_input_ids=inputs)
+        >>> prediction_scores = outputs[0]

-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
-        inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf")  # Batch size 1
-        model.generate(inputs)
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf")  # Batch size 1
+        >>> result = model.generate(inputs)

        """

--- a/src/transformers/modeling_tf_transfo_xl.py
+++ b/src/transformers/modeling_tf_transfo_xl.py
@@ -22,7 +22,7 @@ import logging
 import tensorflow as tf

 from .configuration_transfo_xl import TransfoXLConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
 from .modeling_tf_utils import (
    TFPreTrainedModel,
@@ -36,6 +36,8 @@ from .tokenization_utils import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
+
 TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "transfo-xl-wt103",
    # See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
@@ -722,6 +724,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
        self.transformer = TFTransfoXLMainLayer(config, name="transformer")

    @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -743,18 +746,6 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import TransfoXLTokenizer, TFTransfoXLModel
-
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states, mems = outputs[:2]
-
        """
        outputs = self.transformer(inputs, **kwargs)
        return outputs
@@ -811,6 +802,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
        return self.transformer.init_mems(bsz)

    @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
    def call(
        self,
        inputs,
@@ -842,18 +834,6 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel
-
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, mems = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
@@ -863,7 +843,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
            labels = inputs[4] if len(inputs) > 4 else labels
            output_attentions = inputs[5] if len(inputs) > 5 else output_attentions
            assert len(inputs) <= 6, "Too many inputs."
-        elif isinstance(inputs, dict):
+        elif isinstance(inputs, (BatchEncoding, dict)):
            input_ids = inputs.get("input_ids")
            mems = inputs.get("mems", mems)
            head_mask = inputs.get("head_mask", head_mask)
--- a/src/transformers/modeling_tf_xlm.py
+++ b/src/transformers/modeling_tf_xlm.py
@@ -24,7 +24,12 @@ import numpy as np
 import tensorflow as tf

 from .configuration_xlm import XLMConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_utils import (
    TFMultipleChoiceLoss,
    TFPreTrainedModel,
@@ -43,6 +48,8 @@ from .tokenization_utils import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "XLMTokenizer"
+
 TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "xlm-mlm-en-2048",
    "xlm-mlm-ende-1024",
@@ -608,6 +615,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
        self.transformer = TFXLMMainLayer(config, name="transformer")

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -625,18 +633,6 @@ class TFXLMModel(TFXLMPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMModel
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        outputs = self.transformer(inputs, **kwargs)
        return outputs
@@ -704,6 +700,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
        return {"inputs": inputs, "langs": langs}

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -721,18 +718,6 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMWithLMHeadModel
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        transformer_outputs = self.transformer(inputs, **kwargs)

@@ -757,6 +742,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
    def call(
        self,
        inputs=None,
@@ -795,19 +781,6 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMForSequenceClassification
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[11] if len(inputs) > 11 else labels
@@ -865,6 +838,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
        return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
    def call(
        self,
        inputs,
@@ -876,9 +850,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
        cache=None,
        head_mask=None,
        inputs_embeds=None,
-        labels=None,
        output_attentions=None,
        output_hidden_states=None,
+        labels=None,
        training=False,
    ):
        r"""
@@ -904,22 +878,6 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMForMultipleChoice
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMForMultipleChoice.from_pretrained('xlm-mlm-en-2048')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
@@ -932,7 +890,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
            head_mask = inputs[7] if len(inputs) > 7 else head_mask
            inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
            output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
-            assert len(inputs) <= 10, "Too many inputs."
+            output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
+            labels = inputs[11] if len(inputs) > 11 else labels
+            assert len(inputs) <= 11, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
@@ -944,7 +904,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            output_attentions = inputs.get("output_attentions", output_attentions)
-            assert len(inputs) <= 10, "Too many inputs."
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            labels = inputs.get("labels", labels)
+            assert len(inputs) <= 12, "Too many inputs."
        else:
            input_ids = inputs

@@ -1001,13 +963,14 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
        self.transformer = TFXLMMainLayer(config, name="transformer")
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.classifier = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+            config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
        )

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
    def call(
        self,
-        input_ids=None,
+        inputs=None,
        attention_mask=None,
        langs=None,
        token_type_ids=None,
@@ -1016,9 +979,9 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
        cache=None,
        head_mask=None,
        inputs_embeds=None,
-        labels=None,
        output_attentions=None,
        output_hidden_states=None,
+        labels=None,
        training=False,
    ):
        r"""
@@ -1041,25 +1004,22 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMForTokenClassification
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMForTokenClassification.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
        """
+        if isinstance(inputs, (tuple, list)):
+            labels = inputs[11] if len(inputs) > 11 else labels
+            if len(inputs) > 11:
+                inputs = inputs[:11]
+        elif isinstance(inputs, (dict, BatchEncoding)):
+            labels = inputs.pop("labels", labels)
+
        transformer_outputs = self.transformer(
-            input_ids,
+            inputs,
            attention_mask=attention_mask,
+            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
@@ -1072,7 +1032,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
        sequence_output = self.dropout(sequence_output, training=training)
        logits = self.classifier(sequence_output)

-        outputs = (logits,) + transformer_outputs[2:]  # add hidden states and attention if they are here
+        outputs = (logits,) + transformer_outputs[1:]  # add hidden states and attention if they are here

        if labels is not None:
            loss = self.compute_loss(labels, logits)
@@ -1095,6 +1055,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
        )

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
    def call(
        self,
        inputs=None,
@@ -1139,21 +1100,6 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-
        """
        if isinstance(inputs, (tuple, list)):
            start_positions = inputs[11] if len(inputs) > 11 else start_positions
--- a/src/transformers/modeling_tf_xlnet.py
+++ b/src/transformers/modeling_tf_xlnet.py
@@ -23,7 +23,12 @@ import numpy as np
 import tensorflow as tf

 from .configuration_xlnet import XLNetConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_utils import (
    TFMultipleChoiceLoss,
    TFPreTrainedModel,
@@ -42,6 +47,8 @@ from .tokenization_utils import BatchEncoding

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "XLNetTokenizer"
+
 TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "xlnet-base-cased",
    "xlnet-large-cased",
@@ -832,6 +839,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
        self.transformer = TFXLNetMainLayer(config, name="transformer")

    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
    def call(self, inputs, **kwargs):
        r"""
    Return:
@@ -853,18 +861,6 @@ class TFXLNetModel(TFXLNetPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetModel
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = TFXLNetModel.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        outputs = self.transformer(inputs, **kwargs)
        return outputs
@@ -949,10 +945,13 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):

        # We show how to setup inputs to predict a next token using a bi-directional context.
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :]  # We will predict the masked token
+
        perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+
        target_mapping = np.zeros((1, 1, input_ids.shape[1]))  # Shape [1, 1, seq_length] => let's predict one token
        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+
        outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))

        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
@@ -986,6 +985,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
        )

    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
    def call(
        self,
        inputs=None,
@@ -1029,19 +1029,6 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetForSequenceClassification
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[12] if len(inputs) > 12 else labels
@@ -1105,6 +1092,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
        return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}

    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
    def call(
        self,
        inputs=None,
@@ -1145,22 +1133,6 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetForMultipleChoice
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = TFXLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
@@ -1257,6 +1229,8 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )

+    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
    def call(
        self,
        inputs=None,
@@ -1298,19 +1272,6 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetForTokenClassification
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
        """
        if isinstance(inputs, (tuple, list)):
            labels = inputs[12] if len(inputs) > 12 else labels
@@ -1361,6 +1322,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
        )

    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
    def call(
        self,
        inputs=None,
@@ -1412,21 +1374,6 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-
        """
        if isinstance(inputs, (tuple, list)):
            start_positions = inputs[12] if len(inputs) > 12 else start_positions
--- a/src/transformers/modeling_transfo_xl.py
+++ b/src/transformers/modeling_transfo_xl.py
@@ -27,13 +27,15 @@ import torch.nn as nn
 import torch.nn.functional as F

 from .configuration_transfo_xl import TransfoXLConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
 from .modeling_utils import PreTrainedModel


 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
+
 TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "transfo-xl-wt103",
    # See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
@@ -749,6 +751,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        return new_mems

    @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
    def forward(
        self,
        input_ids=None,
@@ -778,18 +781,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import TransfoXLTokenizer, TransfoXLModel
-        import torch
-
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states, mems = outputs[:2]
-
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@@ -945,6 +936,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
        return self.transformer.init_mems(bsz)

    @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
    def forward(
        self,
        input_ids=None,
@@ -984,18 +976,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
-        import torch
-
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, mems = outputs[:2]
-
        """
        if input_ids is not None:
            bsz, tgt_len = input_ids.size(0), input_ids.size(1)
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -978,13 +978,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):

        Examples::

+            from transformers import AutoTokenizer, AutoModelForCausalLM
+
            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
+            model = AutoModelForCausalLM.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
            outputs = model.generate(max_length=40)  # do greedy decoding
            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))

            tokenizer = AutoTokenizer.from_pretrained('openai-gpt')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('openai-gpt')    # Download model and configuration from S3 and cache.
+            model = AutoModelForCausalLM.from_pretrained('openai-gpt')    # Download model and configuration from S3 and cache.
            input_context = 'The dog'
            input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
            outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5)  # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
@@ -992,22 +994,22 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))

            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
+            model = AutoModelForCausalLM.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
            input_context = 'The dog'
            input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
-            outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3)  # 3 generate sequences using by sampling
+            outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True)  # generate 3 sequences by sampling
            for i in range(3): #  3 output sequences were generated
                print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))

            tokenizer = AutoTokenizer.from_pretrained('ctrl')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('ctrl')    # Download model and configuration from S3 and cache.
+            model = AutoModelForCausalLM.from_pretrained('ctrl')    # Download model and configuration from S3 and cache.
            input_context = 'Legal My neighbor is'  # "Legal" is one of the control codes for ctrl
            input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
            outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2)  # generate sequences
            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))

            tokenizer = AutoTokenizer.from_pretrained('gpt2')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('gpt2')    # Download model and configuration from S3 and cache.
+            model = AutoModelForCausalLM.from_pretrained('gpt2')    # Download model and configuration from S3 and cache.
            input_context = 'My cute dog'  # "Legal" is one of the control codes for ctrl
            bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
            input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
--- a/src/transformers/modeling_xlm.py
+++ b/src/transformers/modeling_xlm.py
@@ -28,7 +28,7 @@ from torch.nn import functional as F

 from .activations import gelu
 from .configuration_xlm import XLMConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import (
    PreTrainedModel,
    SequenceSummary,
@@ -40,6 +40,8 @@ from .modeling_utils import (

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "XLMTokenizer"
+
 XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "xlm-mlm-en-2048",
    "xlm-mlm-ende-1024",
@@ -395,6 +397,7 @@ class XLMModel(XLMPreTrainedModel):
            self.attentions[layer].prune_heads(heads)

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
    def forward(
        self,
        input_ids=None,
@@ -425,18 +428,6 @@ class XLMModel(XLMPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMModel
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@@ -632,6 +623,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
        return {"input_ids": input_ids, "langs": langs}

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
    def forward(
        self,
        input_ids=None,
@@ -672,18 +664,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMWithLMHeadModel
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        transformer_outputs = self.transformer(
            input_ids,
@@ -722,6 +702,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
    def forward(
        self,
        input_ids=None,
@@ -761,19 +742,6 @@ class XLMForSequenceClassification(XLMPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMForSequenceClassification
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
        """
        transformer_outputs = self.transformer(
            input_ids,
@@ -822,6 +790,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
    def forward(
        self,
        input_ids=None,
@@ -867,20 +836,6 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMForQuestionAnsweringSimple
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss = outputs[0]
-
        """
        transformer_outputs = self.transformer(
            input_ids,
@@ -1006,19 +961,20 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

-    Examples::
+    Example::

-        from transformers import XLMTokenizer, XLMForQuestionAnswering
-        import torch
+        >>> from transformers import XLMTokenizer, XLMForQuestionAnswering
+        >>> import torch

-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss = outputs[0]
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')

+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+
+        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        >>> loss = outputs[0]
        """
        transformer_outputs = self.transformer(
            input_ids,
@@ -1067,6 +1023,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
    def forward(
        self,
        input_ids=None,
@@ -1074,6 +1031,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
        langs=None,
        token_type_ids=None,
        position_ids=None,
+        lengths=None,
+        cache=None,
        head_mask=None,
        labels=None,
        output_attentions=None,
@@ -1101,19 +1060,6 @@ class XLMForTokenClassification(XLMPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMForTokenClassification
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
-        model = XLMForTokenClassification.from_pretrained('xlm-mlm-100-1280')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
        """
        outputs = self.transformer(
            input_ids,
@@ -1121,6 +1067,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -26,12 +26,14 @@ from torch.nn import functional as F

 from .activations import gelu_new, swish
 from .configuration_xlnet import XLNetConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary


 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "XLNetTokenizer"
+
 XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "xlnet-base-cased",
    "xlnet-large-cased",
@@ -749,6 +751,7 @@ class XLNetModel(XLNetPreTrainedModel):
        return pos_emb

    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
    def forward(
        self,
        input_ids=None,
@@ -785,20 +788,6 @@ class XLNetModel(XLNetPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetModel
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetModel.from_pretrained('xlnet-large-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=False)).unsqueeze(0)  # Batch size 1
-
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@@ -1164,6 +1153,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
    def forward(
        self,
        input_ids=None,
@@ -1208,20 +1198,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForSequenceClassification
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
        """
        transformer_outputs = self.transformer(
            input_ids,
@@ -1273,6 +1249,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
    def forward(
        self,
        input_ids=None,
@@ -1316,21 +1293,6 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForTokenClassification
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        scores = outputs[0]
-
        """

        outputs = self.transformer(
@@ -1386,6 +1348,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
    def forward(
        self,
        input_ids=None,
@@ -1431,22 +1394,6 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForMultipleChoice
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
-
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-
-        outputs = model(input_ids, labels=labels)
-        loss, classification_scores = outputs[:2]
-
        """
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

@@ -1508,6 +1455,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
        self.init_weights()

    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
    def forward(
        self,
        input_ids=None,
@@ -1558,22 +1506,6 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForQuestionAnsweringSimple
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss = outputs[0]
-
        """

        outputs = self.transformer(
@@ -1705,20 +1637,20 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

-    Examples::
+        Example::

-        from transformers import XLNetTokenizer, XLNetForQuestionAnswering
-        import torch
+            >>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
+            >>> import torch

-        tokenizer =  XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
+            >>> tokenizer =  XLNetTokenizer.from_pretrained('xlnet-base-cased')
+            >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')

-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss = outputs[0]
+            >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            >>> start_positions = torch.tensor([1])
+            >>> end_positions = torch.tensor([3])
+            >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)

+            >>> loss = outputs[0]
        """
        transformer_outputs = self.transformer(
            input_ids,
--- a/src/transformers/tokenization_bart.py
+++ b/src/transformers/tokenization_bart.py
@@ -66,13 +66,15 @@ class MBartTokenizer(XLMRobertaTokenizer):
    The tokenization method is <tokens> <eos> <language code>. There is no BOS token.

    Examples::
-        from transformers import MBartTokenizer
-        tokenizer = MBartTokenizer.from_pretrained('mbart-large-en-ro')
-        example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
-        expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-        batch: dict = tokenizer.prepare_translation_batch(
-            example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
-        )
+
+        >>> from transformers import MBartTokenizer
+        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro')
+        >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
+        >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
+        >>> batch: dict = tokenizer.prepare_translation_batch(
+        ...     example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
+        ... )
+
    """

    vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}
--- a/src/transformers/tokenization_marian.py
+++ b/src/transformers/tokenization_marian.py
@@ -25,13 +25,13 @@ class MarianTokenizer(PreTrainedTokenizer):

    Examples::

-        from transformers import MarianTokenizer
-        tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
-        src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
-        tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
-        batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
-        # keys  [input_ids, attention_mask, decoder_input_ids,  decoder_attention_mask].
-        # model(**batch) should work
+        >>> from transformers import MarianTokenizer
+        >>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
+        >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
+        >>> batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
+        >>> # keys  [input_ids, attention_mask, decoder_input_ids,  decoder_attention_mask].
+        >>> # model(**batch_enc) should work
    """

    vocab_files_names = vocab_files_names
--- a/src/transformers/tokenization_reformer.py
+++ b/src/transformers/tokenization_reformer.py
@@ -81,6 +81,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

    def __init__(
        self,
--- a/src/transformers/tokenization_t5.py
+++ b/src/transformers/tokenization_t5.py
@@ -94,6 +94,7 @@ class T5Tokenizer(PreTrainedTokenizer):
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

    def __init__(
        self,
--- a/tests/test_doc_samples.py
+++ b/tests/test_doc_samples.py
@@ -13,52 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import doctest
+import logging
 import os
 import unittest
+from pathlib import Path
 from typing import List, Union

+import transformers
+
 from .utils import require_tf, require_torch, slow


-def get_examples_from_file(file):
-    examples = []
-    example = []
-    example_mode = False
-    example_indentation = None
-    for i, line in enumerate(file):
-        if example_mode:
-            current_indentation = len(line) - len(line.strip()) - 1
-
-            # Check if the indentation is 0 for the example, so that we don't exit as soon as there's a line return.
-            empty_line = example_indentation == 0 and len(line) == 1
-
-            # If we're back to the example indentation or if it's the end of the docstring.
-            if (current_indentation == example_indentation and not empty_line) or '"""' in line:
-                # Exit the example mode and add the example to the examples list
-                example_mode = False
-                example_indentation = None
-                examples.append(example)
-                example = []
-            else:
-                # If line is not empty, add it to the current example
-                if line != "\n":
-                    example.append(line[example_indentation + 4 : -1])
-
-        # Detect the example from '::' or 'example::'
-        if "example::" in line.lower():
-            example_mode = True
-            example_indentation = line.lower().find("example::")
-        elif "examples::" in line.lower():
-            example_mode = True
-            example_indentation = line.lower().find("examples::")
-        # elif "::" in line.lower() and len(line.strip()) == 2:
-        #     example_mode = True
-        #     example_indentation = line.lower().find("::")
-
-    examples = ["\n".join(example) for example in examples]
-    examples = [example for example in examples if "not runnable" not in example.lower()]
-
-    return examples
+logger = logging.getLogger()


@require_torch
@@ -66,68 +33,81 @@ def get_examples_from_file(file):
@slow
 class TestCodeExamples(unittest.TestCase):
    def analyze_directory(
-        self, directory: str, identifier: Union[str, None] = None, ignore_files: Union[List[str], None] = None
+        self,
+        directory: Path,
+        identifier: Union[str, None] = None,
+        ignore_files: Union[List[str], None] = None,
+        n_identifier: Union[str, List[str], None] = None,
+        only_modules: bool = True,
    ):
+        """
+        Runs through the specified directory, looking for the files identified with `identifier`. Executes
+        the doctests in those files.
+
+        Args:
+            directory (:obj:`Path`): Directory containing the files (a :obj:`Path` is required when `only_modules` is False)
+            identifier (:obj:`str`): Only files whose name contains this string are tested
+            ignore_files (:obj:`List[str]`): File names to skip; `__init__.py` is always skipped, the list is not modified
+            n_identifier (:obj:`str` or :obj:`List[str]`): Files whose name contains this/these identifiers are not tested.
+            only_modules (:obj:`bool`): If True, doctest the matching `transformers` attribute; else run `doctest.testfile`
+        """
        files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]

        if identifier is not None:
            files = [file for file in files if identifier in file]

-        if ignore_files is not None:
-            files = [file for file in files if file not in ignore_files]
+        if n_identifier is not None:
+            if isinstance(n_identifier, list):
+                for n_ in n_identifier:
+                    files = [file for file in files if n_ not in file]
+            else:
+                files = [file for file in files if n_identifier not in file]
+
+        ignore_files = [*(ignore_files or []), "__init__.py"]  # fresh list: never mutate the caller's argument
+        files = [file for file in files if file not in ignore_files]

        for file in files:
            # Open all files
-            print("Testing", file, end=" ")
-            with open(os.path.join(directory, file)) as f:
-                # Retrieve examples
-                examples = get_examples_from_file(f)
-                joined_examples = []
+            print("Testing", file)

-                def execute_example(code_example):
-                    exec(code_example, {})
-
-                # Some examples are the continuation of others.
-                if len(examples) > 0:
-                    joined_examples.append(examples[0])
-                    joined_examples_index = 0
-                    for example in examples[1:]:
-                        # If they contain this line, then they're a continuation of the previous script
-                        if "# Continuation of the previous script" in example:
-                            joined_examples[joined_examples_index] += "\n" + example
-                        # If not, create a new example and increment the index
-                        else:
-                            joined_examples.append(example)
-                            joined_examples_index += 1
-
-                print(str(len(joined_examples)) + "/" + str(len(joined_examples)))
-
-                # Execute sub tests with every example.
-                for index, code_example in enumerate(joined_examples):
-                    with self.subTest(msg=file + " " + str(index) + "/" + str(len(joined_examples)) + code_example):
-                        execute_example(code_example)
-
-    def test_configuration_examples(self):
-        transformers_directory = "src/transformers"
-        configuration_files = "configuration"
-        ignore_files = ["configuration_auto.py", "configuration_utils.py"]
-        self.analyze_directory(transformers_directory, identifier=configuration_files, ignore_files=ignore_files)
-
-    def test_main_doc_examples(self):
-        doc_directory = "docs/source"
-        ignore_files = ["favicon.ico"]
-        self.analyze_directory(doc_directory, ignore_files=ignore_files)
+            if only_modules:
+                try:
+                    module_identifier = file.split(".")[0]
+                    module_identifier = getattr(transformers, module_identifier)
+                    suite = doctest.DocTestSuite(module_identifier)
+                    result = unittest.TextTestRunner().run(suite)
+                    self.assertEqual(len(result.failures), 0)
+                except AttributeError:
+                    logger.info(f"{module_identifier} is not a module.")
+            else:
+                result = doctest.testfile(str(".." / directory / file), optionflags=doctest.ELLIPSIS)
+                self.assertEqual(result.failed, 0)

    def test_modeling_examples(self):
        transformers_directory = "src/transformers"
-        modeling_files = "modeling"
+        files = "modeling"
        ignore_files = [
-            "modeling_auto.py",
-            "modeling_t5.py",
-            "modeling_tf_auto.py",
-            "modeling_utils.py",
-            "modeling_tf_t5.py",
-            "modeling_bart.py",
-            "modeling_tf_utils.py",
+            "modeling_ctrl.py",
+            "modeling_tf_ctrl.py",
        ]
-        self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)
+        self.analyze_directory(transformers_directory, identifier=files, ignore_files=ignore_files)
+
+    def test_tokenization_examples(self):
+        transformers_directory = Path("src/transformers")
+        files = "tokenization"
+        self.analyze_directory(transformers_directory, identifier=files)
+
+    def test_configuration_examples(self):
+        transformers_directory = Path("src/transformers")
+        files = "configuration"
+        self.analyze_directory(transformers_directory, identifier=files)
+
+    def test_remaining_examples(self):
+        transformers_directory = Path("src/transformers")
+        n_identifiers = ["configuration", "modeling", "tokenization"]
+        self.analyze_directory(transformers_directory, n_identifier=n_identifiers)
+
+    def test_doc_sources(self):
+        doc_source_directory = Path("docs/source")
+        ignore_files = ["favicon.ico"]
+        self.analyze_directory(doc_source_directory, ignore_files=ignore_files, only_modules=False)
--- a/tests/test_modeling_tf_xlm.py
+++ b/tests/test_modeling_tf_xlm.py
@@ -31,6 +31,7 @@ if is_tf_available():
        TFXLMWithLMHeadModel,
        TFXLMForSequenceClassification,
        TFXLMForQuestionAnsweringSimple,
+        TFXLMForTokenClassification,
        TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
    )

@@ -219,6 +220,26 @@ class TFXLMModelTester:

        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])

+    def create_and_check_xlm_for_token_classification(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        input_mask,
+    ):
+        config.num_labels = self.num_labels
+        model = TFXLMForTokenClassification(config=config)
+        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+        (logits,) = model(inputs)
+        result = {
+            "logits": logits.numpy(),
+        }
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
+
    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
@@ -244,7 +265,14 @@ class TFXLMModelTester:
 class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):

    all_model_classes = (
-        (TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple)
+        # TODO The multiple choice model is missing and should be added.
+        (
+            TFXLMModel,
+            TFXLMWithLMHeadModel,
+            TFXLMForSequenceClassification,
+            TFXLMForQuestionAnsweringSimple,
+            TFXLMForTokenClassification,
+        )
        if is_tf_available()
        else ()
    )
@@ -275,6 +303,10 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)

+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
+
    @slow
    def test_model_from_pretrained(self):
        for model_name in TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: